563 Commits

Author SHA1 Message Date
jbrodman
8f902fde9c Merge pull request #420 from jbrodman/master
Fix for c++ backend
2013-01-08 11:55:03 -08:00
james.brodman
b6023c517e Fix/Hack to avoid the cbackend generating spurious array type declarations. 2013-01-08 14:53:17 -05:00
james.brodman
42d77e9191 Modified to mirror asin.ispc and not fail. 2013-01-08 14:33:32 -05:00
jbrodman
312a7582df Merge pull request #419 from jbrodman/master
Fix to acos.ispc test
2013-01-08 11:32:34 -08:00
jbrodman
dc939eba78 Merge pull request #418 from mmp/master
Fix build with LLVM top-of-tree, fix warnings, remove LLVM 3.0 support
2013-01-08 10:28:02 -08:00
jbrodman
f8bec51de2 Merge pull request #411 from pengtu/master
Simple fixes to allow SOA pointers and arrays to be passed as function arguments.
2013-01-08 08:40:01 -08:00
Matt Pharr
0bf1320a32 Remove support for building with LLVM 3.0 2013-01-06 12:27:53 -08:00
Matt Pharr
81dbd504aa Small fixes to eliminate compiler warnings when using clang 2013-01-06 12:10:54 -08:00
Matt Pharr
63dd7d9859 Fix build to work with LLVM top-of-tree again 2013-01-06 12:02:08 -08:00
Jean-Luc Duprat
2063d34f3e Merge pull request #414 from jbrodman/master
Fix to build with 3.2
2013-01-03 11:00:45 -08:00
james.brodman
83fdc2e5ad Fix to build with 3.2. LLVM API Change? 2013-01-03 13:43:47 -05:00
Peng Tu
6ba7368ab0 Fix two compile-time errors to allow SOA pointers and arrays to be passed as function arguments. 2012-12-11 17:20:15 -08:00
Jean-Luc Duprat
c2805942a9 Merge pull request #409 from mmp/master
Bugfix for issue #408.
2012-12-06 09:46:34 -08:00
Matt Pharr
9892c8bf9a Fix logic for ordering of struct declarations in generated header files.
When a struct had an array of another struct type as a member, we weren't
detecting that the struct type in the array needed to be declared before the
enclosing struct type.

Fixes issue #408.
2012-12-06 11:39:22 -05:00
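For illustration, a minimal ispc-source sketch of the case described above (struct names hypothetical); with the fix, the generated header declares B before the enclosing A:

    struct B { float x; };
    struct A { B elems[4]; };   // array-of-struct member: B must be emitted first in the header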
Matt Pharr
23e5877509 merge 2012-12-02 14:32:52 -08:00
Matt Pharr
8cbfde6092 Small fixes to build with LLVM top-of-tree (now numbered as version 3.3) 2012-12-02 14:29:24 -08:00
Jean-Luc Duprat
24087ff3cc Expose none() in the ISPC standard library.
On KNC: all(), any() and none() do not generate a redundant movmsk instruction.
2012-11-27 13:38:28 -08:00
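A small usage sketch of the newly exposed function (function and variable names hypothetical):

    uniform bool all_non_positive(float v) {
        return none(v > 0.);   // true iff the comparison is false in every program instance
    }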
Jean-Luc Duprat
6827001c1d Merge pull request #406 from pengtu/master
Fix ISPC with LLVM TOT build problem
2012-11-22 09:27:10 -08:00
Peng Tu
16b0806d40 Fix LLVM TOT build issue. 2012-11-21 19:09:10 -08:00
Jean-Luc Duprat
2129b1e27d knc.h: Fixed __rsqrt_varying_float() to use _mm512_invsqrt_ps() instead of _mm512_invsqrt_pd()
This was a typo.
2012-11-21 15:40:35 -08:00
Jean-Luc Duprat
a267762f59 Merge pull request #404 from mmp/master
Fix build with LLVM top-of-tree
2012-11-21 10:37:40 -08:00
Jean-Luc Duprat
65ca795030 Merge pull request #405 from jbrodman/master
Tweaked Scalar Repl of Aggregates Optimization
2012-11-19 13:12:26 -08:00
Matt Pharr
e82b649ec0 Fix build with LLVM top-of-tree (various changes to clang entrypoints). 2012-11-16 11:04:11 -08:00
james.brodman
275cdb1713 Merge branch 'master' of https://github.com/ispc/ispc 2012-11-14 13:30:45 -05:00
Jean-Luc Duprat
d3b86dcc90 KNC: fix implementation of __all() to use KNCni mask test instructions... 2012-11-14 09:24:01 -08:00
james.brodman
c736b75075 Merge branch 'master' of https://github.com/ispc/ispc 2012-11-13 17:08:09 -05:00
Jean-Luc Duprat
b601331362 Approximations for inverse sqrt and reciprocal are now provided in fast math mode.
RCP was actually slow in fast math mode.
Inverse sqrt did not expose a fast approximation.
2012-11-13 14:01:35 -08:00
ptu1
32d44a5b9e Merge branch 'master' of ssh://fmygit6001.fm.intel.com:29418/ssg_dpd_tpi_ispc-ispc_git 2012-11-13 12:47:13 -08:00
ptu1
810784da1f Set the ScalarReplAggregate maximum structure size based on target vector width. 2012-11-13 12:35:45 -08:00
james.brodman
d517b37f3f Merge branch 'master' of https://github.com/ispc/ispc 2012-11-09 10:14:18 -05:00
Jean-Luc Duprat
adeef0af01 Merge pull request #403 from jbrodman/master
Fixed =/== error for KNC intrinsic implementation of __all()
2012-11-08 13:57:42 -08:00
james.brodman
97ddc1ed10 Fixed =/== error in __all() 2012-11-08 16:30:12 -05:00
james.brodman
bf580648a1 Merge branch 'master' of https://github.com/ispc/ispc 2012-11-06 12:03:27 -05:00
Jean-Luc Duprat
ecc54fa0eb Merge pull request #402 from pengtu/master
Fix a bug where an unsigned index variable in subscript is sxt to 64 bit
2012-11-05 21:51:38 -08:00
Peng Tu
04d32ae3e6 Inside LLVM, both signed and unsigned integers are represented with the same type - i32 - effectively a signed int32. On a 64-bit target, we must generate an explicit sext/zext during LLVM IR creation to promote the array index to 64 bits. Otherwise, an unsigned int index becomes a signed int index in the LLVM IR.
I limit the fix to uniform indices to avoid widening a varying index vector to 64 bits.  This means that the 32-bit values in varying indices must be positive and smaller than 2^31 at runtime for a program to behave correctly.
2012-11-05 15:02:15 -08:00
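A minimal sketch of the case being fixed (function and variable names hypothetical): on a 64-bit target, the uniform unsigned index below must be zero-extended rather than sign-extended when the subscript is promoted to 64 bits.

    uniform float load_at(uniform float * uniform a, uniform unsigned int i) {
        return a[i];   // an index >= 2^31 was previously treated as a negative offset
    }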
james.brodman
a18b3ae88e Merge branch 'master' of https://github.com/ispc/ispc 2012-10-31 15:25:41 -04:00
james.brodman
e57801a5d1 Typo Fix 2012-10-31 15:25:26 -04:00
ingowald
da4390aede Merge pull request #401 from pengtu/master
Fix a "continue" handling bug in foreach_unique/foreach_active
2012-10-30 01:46:01 -07:00
Peng Tu
9e85667219 Merge remote branch 'upstream/master' 2012-10-29 22:51:22 -07:00
Peng Tu
b80867d473 Move the call to RestoreContinuedLanes from bbBody to the correct place of bbCheckForMore for foreach_unique and foreach_active. 2012-10-29 17:27:11 -07:00
Jean-Luc Duprat
d742dcce59 Merge pull request #400 from jbrodman/master
Fixes a compile error in examples/intrinsics/sse4.h. Assignment was used instead of equality comparison.
2012-10-26 14:08:56 -07:00
james.brodman
7a7af3d5f9 Merge branch 'master' of https://github.com/jbrodman/ispc 2012-10-26 16:55:53 -04:00
jbrodman
e323b1d0ad Fixed compile error: == instead of = 2012-10-26 16:55:28 -04:00
james.brodman
3c18c7a713 Fixed compile error: == instead of = 2012-10-26 16:52:54 -04:00
james.brodman
7c16292cb7 Merge branch 'master' of https://github.com/ispc/ispc 2012-10-24 13:49:04 -04:00
Gerrit Code Review
d665e2e85b Initial empty repository 2012-10-24 09:53:29 -07:00
Matt Pharr
172a189c6f Fix build with LLVM top-of-tree 2012-10-17 11:11:50 -07:00
Matt Pharr
406fbab40e Fix bugs in declarations of __any, __all, and __none in examples/intrinsics.
They return bool, not vector of bool.
2012-10-17 10:55:50 -07:00
Matt Pharr
09dc217f8c Fix hex constant in lParseInteger() (missing an f) 2012-10-16 06:03:33 -07:00
Matt Pharr
9002837750 Remove incorrect assert in tasksys.cpp 2012-10-15 10:43:46 -07:00
Matt Pharr
411d5b44ef Add ISPC_HAS_RAND definition on targets that have a HW RNG.
This lets us check for a functioning rdrand() call in the stdlib
more reliably.  Fixes issue #333.
2012-10-03 09:18:12 -07:00
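A sketch of how code can key off the new define; the fallback named in the comment is illustrative, and the exact rdrand() overloads are not spelled out in the commit message:

    #ifdef ISPC_HAS_RAND
        // hardware RNG present: the stdlib rdrand() routines can be used here
    #else
        // no hardware RNG: fall back to the software RNG (e.g. seed_rng()/random())
    #endif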
Matt Pharr
360cc8044e Improve RNG documentation.
Issue #390.
2012-10-03 08:33:43 -07:00
Matt Pharr
ec2e9b5e79 Fix typo in assert() documentation.
Issue #388.
2012-10-03 08:26:38 -07:00
Matt Pharr
881dba61e4 Fix build with LLVM top-of-tree 2012-09-28 06:07:01 -07:00
Matt Pharr
6412876f64 Remove unused __reduce_add_uint{32,64} target functions.
The stdlib code just calls the signed int{32,64} functions,
which gives the right result for the unsigned case anyway.
The various targets didn't consistently define the unsigned
variants in any case.
2012-09-28 05:55:41 -07:00
Matt Pharr
538d51cbfe Add GMRES example 2012-09-20 14:06:55 -07:00
Jean-Luc Duprat
3dd9ff3d84 knc.h:
	Properly pick up on ISPC_FORCE_ALIGNED_MEMORY when --opt=force-aligned-memory is used
	Fixed usage of loadunpack and packstore to use the proper memory offset
	Fixed implementations of __masked_load_*()/__masked_store_*() that were incorrectly (un)packing the loaded lanes
	Cleaned up usage of _mm512_undefined_*(); it is now mostly confined to constructors
	Minor cleanups

knc2x.h:
	Fixed usage of loadunpack and packstore to use the proper memory offset
	Fixed implementations of __masked_load_*()/__masked_store_*() that were incorrectly (un)packing the loaded lanes
	Properly pick up on ISPC_FORCE_ALIGNED_MEMORY when --opt=force-aligned-memory is used
	__any() and __none() speedups
	Cleaned up usage of _mm512_undefined_*(); it is now mostly confined to constructors
2012-09-19 17:11:04 -07:00
Ingo Wald
7f386923b0 Merge branch 'master' of https://github.com/ispc/ispc 2012-09-17 15:54:25 +02:00
Ingo Wald
d2312b1fbd now using the ASSUME_ALIGNED flag in knc.h 2012-09-17 15:54:00 +02:00
Ingo Wald
6655373ac3 commit test 2012-09-17 15:51:37 +02:00
Ingo Wald
d492af7bc0 64-bit gather/scatter, aligned load/store, i8 support 2012-09-17 03:39:02 +02:00
Matt Pharr
230a7b7374 Fix bug with floating-point constant zero vectors.
Issue #377.
2012-09-14 14:24:51 -07:00
Jean-Luc Duprat
4204a752f7 Merge branch 'master' of https://github.com/ispc/ispc 2012-09-14 14:12:49 -07:00
Jean-Luc Duprat
0e88d5f97f Fixed unaligned masked stores on KNC 2012-09-14 14:11:41 -07:00
Matt Pharr
a13e7f2435 #define ISPC_FORCE_ALIGNED_MEMORY, if appropriate, in C++ output. 2012-09-14 13:53:12 -07:00
Matt Pharr
be2108260e Add --opt=force-aligned-memory option.
This forces all vector loads/stores to be done assuming that the given
pointer is aligned to the vector size, thus allowing the use of sometimes
more-efficient instructions.  (If it isn't the case that the memory is
aligned, the program will fail!).
2012-09-14 13:49:45 -07:00
Matt Pharr
59b0a2b208 Mark __any(), __all(), and __none() as internal after they're linked in.
This fixes multiple symbol definition errors when compiling a single binary
for multiple ISA targets.
2012-09-14 13:32:42 -07:00
Matt Pharr
05a5a42a08 Don't force loads/stores from varying types to be unaligned.
These should always actually be aligned in memory.
2012-09-14 12:17:33 -07:00
Jean-Luc Duprat
f0b0618484 Added the following mask tests: __any(), __all(), __none() for all supported targets.
This allows for more efficient code generation for KNC.
2012-09-14 11:06:18 -07:00
Ingo Wald
4ecdbe4bd9 two changes:
- exported structs are now protected with #ifdef/#define blocks (allows including multiple ispc-generated header files in the same C source file)
- when creating offload stubs, encountering an 'export' function for which we cannot produce a stub now only triggers a warning, not an error.
2012-09-08 16:09:04 +02:00
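A sketch of the generated-header pattern the first item describes; the guard macro name here is illustrative, not necessarily the exact one ispc emits:

    #ifndef ISPC_STRUCT_Foo_GUARD
    #define ISPC_STRUCT_Foo_GUARD
    struct Foo { float x; };
    #endif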
Matt Pharr
9e9f266e52 Add files inadvertently missed in c58d92d46b.
Truly fixes issue #363.
2012-09-07 13:27:07 -07:00
Matt Pharr
0ce67f37ac Use LLVM_VERSION env variable to get LLVM version with MSVC build.
Previously, it was set directly in the ispc.vcxproj file.

Issue #371.
2012-09-06 06:04:32 -07:00
Matt Pharr
ddcd0a49ec Fix bugs with handling of 'continue' statements in foreach_* loops. 2012-09-05 10:16:58 -07:00
Matt Pharr
63b8fac852 Improve naming of temporary variable in IR 2012-09-05 10:13:45 -07:00
Matt Pharr
def8d7850b Fix crasher with malformed programs 2012-09-05 08:43:46 -07:00
Jean-Luc Duprat
0442efc856 Merge branch 'master' of https://github.com/ispc/ispc 2012-09-04 11:00:03 -07:00
Jean-Luc Duprat
f928bbb53c Updated usage of Intel® Initial Many Core Instructions (Intel® IMCI). 2012-09-04 10:57:25 -07:00
Jean-Luc Duprat
1ab7500dbb Updated user's guide to comply with Intel® Xeon Phi™ brand usage guidelines 2012-09-04 10:53:01 -07:00
Matt Pharr
c58d92d46b Issue error if a vector-typed parameter is used in an exported function.
Issue #363.
2012-08-31 06:59:58 -07:00
Matt Pharr
8276e912fd Switch to LLVM 3.1 for default for MSVC builds. Also fixes issue #374 2012-08-31 05:58:39 -07:00
Jean-Luc Duprat
e0490d0df5 Minor fixes needed for building on windows. 2012-08-30 10:56:13 -07:00
Jean-Luc Duprat
11db466a88 Implement the KNC prefetch API so that ISPC prefetch_*() stdlib functions may be used. 2012-08-30 10:24:31 -07:00
Matt Pharr
caaee0b666 Fix crash when using launch with non-task-qualified function 2012-08-29 09:06:47 -07:00
Matt Pharr
f2f470f369 Merge pull request #369 from jduprat/master
Task system updates
2012-08-28 14:01:37 -07:00
Jean-Luc Duprat
09bb36f58c Updated the task system in the example directory to support:
Cilk (cilk_for), OpenMP (#pragma omp parallel for), TBB (tbb::task_group and tbb::parallel_for),
as well as a new pthreads-based model that fully subscribes the machine (good for KNC).
With major contributions from Ingo Wald and James Brodman.
2012-08-28 11:13:12 -07:00
Matt Pharr
21719df6fd remove assert that hit with fast-math if user defined their own functions named rcp() 2012-08-21 16:39:36 -07:00
Matt Pharr
39329809dd fix crash with malformed program 2012-08-21 16:35:31 -07:00
Matt Pharr
44797e2925 remove incorrect assert 2012-08-21 16:27:49 -07:00
Jean-Luc Duprat
c8f373d119 Merge branch 'master' of https://github.com/ispc/ispc 2012-08-15 17:42:00 -07:00
Jean-Luc Duprat
8a22c63889 knc2x.h
Introduced knc2x.h, which supports 2x interleaved code generation for KNC (use the target generic-32).
This implementation is even more experimental and incomplete than knc.h, but is already useful (mandelbrot works, for example).

knc.h:
Switch to new intrinsic names _mm512_set_1to16_epi32() -> _mm512_set1_epi32(), etc...
Fix the declaration of the unspecialized template for __smear_*(), __setzero_*(), __undef_*()
Specifically mark a few vectors as _mm512_undefined_*() in __load<>()
Fixed some implementations of __smear_*(), __setzero_*(), __undef_*() to remove unnecessary dependent instructions.
Implemented ISPC reductions by simply calling existing intrinsic reductions, which are slightly more efficient than our previous implementation.  Also added reductions for double types.
2012-08-15 17:41:10 -07:00
Matt Pharr
1a4434d314 Fix build with LLVM top-of-tree 2012-08-11 09:28:48 -07:00
Jean-Luc Duprat
165a13b13e knc.h:
vec16_i64 improved with the addition of the following: __extract_element(), __insert_element(), __sub(), __mul(),
		   __sdiv(), __udiv(), __and(), __or(), __xor(), __shl(), __lshr(), __ashr(), __select()
	Fixed a bug in the __mul(__vec16_i64, __vec16_i32) implementation
	Constructors are all explicitly inlined, copy constructor and operator=() explicitly provided
	Load and stores for __vec16_i64 and __vec16_d use aligned instructions when possible
	__rotate_i32() now has a vector implementation
	Added several reductions: __reduce_add_i32(), __reduce_min_i32(), __reduce_max_i32(),
	       __reduce_add_f(), __reduce_min_f(), __reduce_max_f()
2012-08-10 12:20:10 -07:00
Matt Pharr
43364b2d69 Loosen tolerances to test passes with FMA on AVX2 2012-08-10 06:52:14 -07:00
Matt Pharr
6eaecd20d5 Mark __{get,set}_system_isa builtins as "internal" functions.
This ensures that they have static linkage, which in turn lets one
have multiple object files compiled to multiple targets without having
those cause link errors.

Issue #355.
2012-08-09 16:12:07 -07:00
Matt Pharr
c80bfeacf6 Fix crashes when input program tried to access undefined struct types.
(This in particular would happen when there was an error in the body of a struct
definition and we were left with an UndefinedStructType and then later tried to
do loads/stores from/to it.)

Issue #356.
2012-08-09 14:59:29 -07:00
Matt Pharr
2a19cc1758 Fix cases where we were trying to type cast instead of type convert.
Also, removed erroneous checks about the type of the test expression
in DoStmt and ForStmt.

These together were preventing conversion of pointer types to boolean
values, so things like "while (ptr)" would improperly not compile.

Issue #346.
2012-08-03 12:47:53 -07:00
Matt Pharr
8f5189f606 Type convert arrays in select expressions to pointers to the first element.
Fixes issue #345.
2012-08-03 11:53:59 -07:00
Matt Pharr
49dde7c6f2 Fix bug in declaration of double-precision sqrt intrinsic for AVX targets.
This was preventing sqrts of uniform double values from being compiled
properly.

Issue #344.
2012-08-03 11:43:31 -07:00
Matt Pharr
765a0d8896 Use puts() rather than printf() for printing assertion failure strings.
This way, we don't lose '%'s in the assertion strings.

Issue #342.
2012-08-03 11:31:38 -07:00
Matt Pharr
19d8f2e258 Generate FMA instructions with AVX2 (when possible).
Issue #320.
2012-08-03 10:43:41 -07:00
Matt Pharr
e6aec96e05 Fix build with LLVM top-of-tree 2012-08-03 09:59:41 -07:00
Jean-Luc Duprat
a2d42c3242 KNC: all masked_load_*() and masked_store_*() functions need to do unaligned accesses 2012-08-01 14:37:25 -07:00
Jean-Luc Duprat
52836aae87 Minor documentation clarification on the impact of the ICC -fp-model except option. 2012-08-01 10:24:35 -07:00
Matt Pharr
bda566d6a7 Fix incorrect assertion 2012-08-01 08:11:32 -07:00
Jean-Luc Duprat
63ed90b0fd docs/build.sh runs rst2html rather than rst2html.py
Explicitly documented the fact that ICC needs the -mmic flag to compile for KNC.
Updated ISPC User Guide with details on ICC compiler options that impact FP performance in generated code.
2012-07-30 11:47:25 -07:00
Matt Pharr
0bb4d282e2 Add sys/types.h include for linux/osx. 2012-07-23 08:32:41 -07:00
Matt Pharr
ae89a65dad Fix bug that caused unterminated basic blocks.
Issue #339.
2012-07-23 08:24:18 -07:00
Matt Pharr
e9fe9f5043 Add cpu strings for Ivy Bridge and HSW.
Default to avx2 ISA for HSW CPUs.
2012-07-23 08:24:18 -07:00
Matt Pharr
ce8dc5927c Fix bug in FunctionEmitContext::MatchIntegerTypes
Cause of issue #329.
2012-07-20 10:05:17 -07:00
Matt Pharr
f6989cce38 Disallow native output with generic targets, C++ output with non-generic targets.
Also wrote FAQs about why this is the way it is.
Issue #334.
2012-07-20 09:55:50 -07:00
Jean-Luc Duprat
6dbbf9aa80 Merge branch 'master' of https://github.com/ispc/ispc 2012-07-19 17:33:00 -07:00
Jean-Luc Duprat
fe6282e837 Fixed small issue with name mangling introduced in aecd6e08 2012-07-19 17:32:49 -07:00
Matt Pharr
51210a869b Support core-avx-i and core-avx2 CPU types.
(And map them to avx1.1 and avx2 targets, respectively.)
2012-07-19 10:15:59 -07:00
Matt Pharr
658652a9ff Merge pull request #331 from jduprat/master
New templated API for __setzero() __undef() and __smear()
2012-07-18 16:39:38 -07:00
Jean-Luc Duprat
aecd6e0878 All the smear(), setzero() and undef() APIs are now templated on the return type.
Modified ISPC's internal mangling to pass these through unchanged.
Tried hard to make sure this is not going to introduce an ABI change.
2012-07-17 17:06:36 -07:00
Jean-Luc Duprat
1334a84861 Merge branch 'master' of https://github.com/ispc/ispc 2012-07-17 11:46:30 -07:00
Matt Pharr
6a410fc30e Emit gather instructions for the AVX2 targets.
Issue #308.
2012-07-13 12:29:05 -07:00
Matt Pharr
984a68c3a9 Rename gen_gather() macro to gen_gather_factored() 2012-07-13 12:24:12 -07:00
Matt Pharr
daf5aa8e8b Run inst combine before memory optimizations.
We were previously emitting 64-bit indexing for some gathers where
32-bit was actually fine, due to some adds of constant vectors
that hadn't been simplified to the result.
2012-07-13 12:14:53 -07:00
Matt Pharr
98b2e0e426 Fixes for intrinsics unsupported in earlier LLVM versions.
Specifically, don't use the half/float conversion routines with
LLVM 3.0, and don't try to use RDRAND with anything before LLVM 3.2.
2012-07-13 12:14:10 -07:00
Matt Pharr
9a1932eaf7 Only set gcc's "-msse4.2", etc, option when compiling for generic targets.
We don't need it when ispc is just generating an object file directly, and gcc
on OS X doesn't recognize -mavx.
2012-07-13 12:02:05 -07:00
Matt Pharr
371d4be8ef Fix bugs in detection of Ivy Bridge systems.
We were incorrectly characterizing them as basic AVX1 without further
extensions, due to a bug in the logic to check CPU features.
2012-07-12 14:11:15 -07:00
Matt Pharr
d180031ef0 Add more tests of basic gather functionality. 2012-07-12 14:05:38 -07:00
Jean-Luc Duprat
e09e953bbb Added a few functions: __setzero_i64(), __cast_sext(__vec16_i64, __vec16_i32), __cast_zext(__vec16_i32),
__min_varying_int32(), __min_varying_uint32(), __max_varying_int32(), __max_varying_uint32()
Fixed the signature of __smear_i64() to match current codegen
2012-07-12 10:32:38 -07:00
Matt Pharr
2c640f7e52 Add support for RDRAND in IvyBridge.
The standard library now provides a variety of rdrand() functions
that call out to RDRAND, when available.

Issue #263.
2012-07-12 06:07:07 -07:00
Matt Pharr
2bacebb1fb Doc fixes (Crystal Lemire). 2012-07-11 19:51:28 -07:00
Jean-Luc Duprat
df18b2a150 Fixed missing tmp var needed for use with gather intrinsic 2012-07-11 15:43:11 -07:00
Matt Pharr
216ac4b1a4 Stop factoring out constant offsets for gather/scatter if instr is available.
For KNC (gather/scatter), it's not helpful to factor base+offsets gathers
and scatters into base_ptr + {1/2/4/8} * varying_offsets + const_offsets.
Now, if a HW instruction is available for gather/scatter, we just factor
into base + {1/2/4/8} * offsets (if possible).  Not only is this simpler,
but it's also what we need in order to make use of the scale-by-{2/4/8}
addressing available directly in those instructions.

Finishes issue #325.
2012-07-11 14:52:29 -07:00
Jean-Luc Duprat
898cded646 Merge branch 'master' of https://github.com/ispc/ispc
Conflicts:
	examples/intrinsics/knc.h
2012-07-11 14:45:00 -07:00
Matt Pharr
c09c87873e Whitespace / indentation fixes. 2012-07-11 14:29:46 -07:00
Matt Pharr
10b79fb41b Add support for non-factored variants of gather/scatter functions.
We now have two ways of approaching gather/scatters with a common base
pointer and with offset vectors.  For targets with native gather/scatter,
we just turn those into base + {1/2/4/8}*offsets.  For targets without,
we turn those into base + {1/2/4/8}*varying_offsets + const_offsets,
where const_offsets is a compile-time constant.

Infrastructure for issue #325.
2012-07-11 14:29:42 -07:00
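In comment form, the two addressing decompositions described above (a restatement in pseudocode, not the actual builtin names):

    //   with native gather/scatter:    addr = base + {1,2,4,8} * offsets
    //   without native gather/scatter: addr = base + {1,2,4,8} * varying_offsets + const_offsets
    //                                  (const_offsets is a compile-time constant vector)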
Matt Pharr
ec0280be11 Rename gather/scatter_base_offsets functions to *_factored_base_offsets*.
No functional change; just preparation for having a path that doesn't
factor the offsets into constant and varying parts, which will be better
for AVX2 and KNC.
2012-07-11 14:16:39 -07:00
Matt Pharr
8e19d54e75 Merge pull request #328 from jduprat/explicit_isa_in_tests
Explicit isa in tests
2012-07-10 20:49:37 -07:00
Jean-Luc Duprat
3c070e5e20 run_tests.py will only attempt to use the -mmic flag when the knc.h header is used 2012-07-10 17:07:56 -07:00
Jean-Luc Duprat
dde599f48f run_tests.py now picks the ISA via a -m flag based on the target selected, rather than always picking -msse4.2;
this is needed because -msse4.2 is not supported on KNC.
2012-07-10 16:39:18 -07:00
Jean-Luc Duprat
cc15ecfb3a Merge branch 'master' of https://github.com/ispc/ispc
Conflicts:
	cbackend.cpp
	examples/intrinsics/generic-16.h
	examples/intrinsics/generic-32.h
	examples/intrinsics/generic-64.h
	examples/intrinsics/knc.h
	examples/intrinsics/sse4.h
2012-07-10 16:36:08 -07:00
Jean-Luc Duprat
7a7c54bd59 Minor fixes to knc.h that resulted from integrating bea88ab122 2012-07-10 16:10:48 -07:00
Jean-Luc Duprat
bea88ab122 Integrated changes from mmp/and-fold-opt:
Add peephole optimization to eliminate some mask AND operations.

On KNC, the various vector comparison instructions can optionally
be masked; if a mask is provided, the result is effectively that
the value returned is the AND of the mask with the result of the
comparison.

This change adds an optimization pass to the C++ backend that looks
for vector ANDs where one operand is a comparison and rewrites
them--e.g. "and(equalfloat(a, b), c)" is changed to
"_equal_float_and_mask(a, b, c)", saving an instruction in the end.

Issue #319.

Merge commit '8ef6bc16364d4c08aa5972141748110160613087'

Conflicts:
	examples/intrinsics/knc.h
	examples/intrinsics/sse4.h
2012-07-10 10:33:24 -07:00
Matt Pharr
926b3b9ee3 Fix bugs with mask-handling for switch/do/for/while statements.
All of these pass the current mask to FunctionEmitContext::SetBlockEntryMask()
so that when a break/continue/return is encountered, it can test to see if all
lanes have followed that path and then return; this in turn ensures that we never
run statements with an all-off execution mask.

These functions were passing the function internal mask, not the full mask, and
thus could end up executing code with the mask all off if some lanes were
disabled by an outer function.  (The new tests test this case.)
2012-07-09 15:13:30 -07:00
Matt Pharr
bc7775aef2 Fix __ordered and __unordered floating point functions for C++ target.
Fixes include adding "_float" and "_double" suffixes as appropriate as well
as providing a number of missing implementations.

This fixes a number of failures in the half* tests.
2012-07-09 14:35:51 -07:00
Matt Pharr
107669686c Fix naming of some comparison ops in knc.h 2012-07-09 12:43:15 -07:00
Matt Pharr
bb11b3ab66 Fix build with LLVM 3.0 2012-07-09 10:45:36 -07:00
Jean-Luc Duprat
516ba85abd Merge pull request #322 from mmp/vector-constants
Vector constants
2012-07-09 09:28:26 -07:00
Jean-Luc Duprat
098277b4f0 Merge pull request #321 from mmp/setzero
More varied support for constant vectors from C++ backend.
2012-07-09 08:57:05 -07:00
Matt Pharr
950a989744 Add test that was supposed to go with 080241b7d1 2012-07-09 08:21:15 -07:00
Matt Pharr
fb8b893b10 Fix incorrect LLVM_3_1svn tests.
1. For some time now, we have provided the version without the 'svn' suffix.
2. We should be testing "not LLVM 3.0" in these cases, since they
   apply to LLVM 3.2 and beyond as well...
2012-07-09 07:09:25 -07:00
Matt Pharr
9ca80debb8 Remove stale LLVM 2.9 support from builtins/util.m4 2012-07-09 06:54:29 -07:00
Matt Pharr
080241b7d1 Fix bugs with handling types of integer constants.
We now follow the rule that the type of an integer constant is
the first of int32, uint32, int64, or uint64 that can hold the
value.  (Unless 'u' or 'l' suffixes have been provided.)

Fixes issue #299.
2012-07-08 08:43:03 -07:00
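A comment-only illustration of that rule, with values chosen for illustration:

    //   1000                -> int32   (fits in int32)
    //   3000000000          -> uint32  (doesn't fit in int32)
    //   5000000000          -> int64   (doesn't fit in uint32)
    //   0xffffffffffffffff  -> uint64  (doesn't fit in int64)
    //   16u                 -> unsigned, because of the explicit suffix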
Matt Pharr
0d534720bb Fix bug with constant folding of select expressions.
We would sometimes pass an int32_t * to the ConstExpr constructor
but claim the underlying type was uint32, which made it grumpy.
2012-07-08 08:36:51 -07:00
Matt Pharr
1dc4424a30 Only override module datalayout for generic targets.
Doing it for all targets was causing a number of tests to fail.
(Actual root cause not determined.)
2012-07-07 15:12:50 -07:00
Matt Pharr
57f0cf30c0 Fix small typos in documentation. 2012-07-07 11:19:57 -07:00
Matt Pharr
8ef6bc1636 Add peephole optimization to eliminate some mask AND operations.
On KNC, the various vector comparison instructions can optionally
be masked; if a mask is provided, the result is effectively that
the value returned is the AND of the mask with the result of the
comparison.

This change adds an optimization pass to the C++ backend that looks
for vector ANDs where one operand is a comparison and rewrites
them--e.g. "__and(__equal_float(a, b), c)" is changed to
"__equal_float_and_mask(a, b, c)", saving an instruction in the end.

Issue #319.
2012-07-07 08:35:38 -07:00
Matt Pharr
974b40c8af Add type suffix to comparison ops in C++ output.
e.g. "__equal()" -> "__equal_float()", etc.

No functional change; this is necessary groundwork for a forthcoming
peephole optimization that eliminates ANDs of masks in some cases.
2012-07-07 07:50:59 -07:00
Matt Pharr
45e9e0be0b Map comparison predicates to strings for C++ output in a stand-alone function. 2012-07-06 16:00:09 -07:00
Matt Pharr
ec0918045d Issue error if compiling for multiple targets and program is coming from stdin.
We currently don't support this, so at least now we issue an intelligible error
message in this case.

Issue #269.
2012-07-06 13:21:53 -07:00
Matt Pharr
38bcecd2f3 Print a useful error if llvm-config isn't found when building.
Previously, there was a ton of unintelligible error spew.

Issue #273.
2012-07-06 13:18:11 -07:00
Matt Pharr
aabbdba068 Switch a few remaining fprintf() calls to use Warning()/Error(). 2012-07-06 12:56:45 -07:00
Matt Pharr
84c183da1f Issue error if a non "generic" target is used with C++ emission.
Issue #314.
2012-07-06 12:56:24 -07:00
Matt Pharr
b363b98211 Improve handling of datalayout for generic targets.
Flag 32-bit vector types as only requiring 32-bit alignment (preemptive
bug fix for 32xi1 vectors).

Force module datalayouts to be the same before linking them to silence
an LLVM warning.

Finishes issue #309.
2012-07-06 12:51:17 -07:00
Matt Pharr
8defbeb248 Handle llvm.objectsize intrinsic in C++ backend.
Partially addresses issue #309.
2012-07-06 12:29:23 -07:00
Matt Pharr
f52d227d80 Remove extra newline in error message 2012-07-06 11:31:29 -07:00
Matt Pharr
78cb45fb25 Improve error message with ambiguous function overloads.
Issue #316.
2012-07-06 11:25:57 -07:00
Matt Pharr
2d8026625b Always check the execution mask after break/continue/return.
When "break", "continue", or "return" is used under varying control flow,
we now always check the execution mask to see if all of the program
instances are executing it.  (Previously, this was only done with "cbreak",
"ccontinue", and "creturn", which are now deprecated.)

An important effect of this change is that it fixes a family of cases
where we could end up running with an "all off" execution mask, which isn't
supposed to happen, as it leads to all sorts of invalid behavior.

This change does cause the volume rendering example to run 9% slower, but
doesn't affect the other examples.

Issue #257.
2012-07-06 11:09:11 -07:00
Matt Pharr
73afab464f Provide mask at block entry for switch statements.
This fixes a crash if 'cbreak' was used in a 'switch'.  Renamed
FunctionEmitContext::SetLoopMask() to SetBlockEntryMask(), and
similarly the loopMask member variable.
2012-07-06 11:08:05 -07:00
Matt Pharr
8aa139b6be For C++ output, store constant vector values in local arrays.
When we have a constant vector of primitive types, we now generate
a definition of a static const array of the individual values.  This
in turn allows us to emit a simple aligned vector load to get the
constant vector value, rather than inefficiently inserting the values
into a vector.

Issue #318.
2012-07-06 08:57:09 -07:00
Matt Pharr
e5fe0eabdc Update __load() builtins to take const pointers. 2012-07-06 08:47:47 -07:00
Matt Pharr
0d3993fa25 More varied support for constant vectors from C++ backend.
If we have a vector of all zeros, a __setzero_* function call is emitted,
permitting calling specialized intrinsics for this.  Undefined values
are reflected with an __undef_* call, which similarly allows passing that
information along.

This change also includes a cleanup to the signature of the __smear_*
functions; since they already have different names depending on the
scalar value type, we don't need to use the trick of passing an
undefined value of the return vector type as the first parameter as
an indirect way to overload by return value.

Issue #317.
2012-07-05 20:19:11 -07:00
Jean-Luc Duprat
ac421f68e2 Ongoing support for int64 for KNC:
Fixes to __load and __store.
Added __add, __mul, __equal, __not_equal, __extract_elements, __smear_i64, __cast_sext, __cast_zext,
and __scatter_base_offsets32_float.

__rcp_varying_float now has a fast-math and full-precision implementation.
2012-07-05 17:05:42 -07:00
Jean-Luc Duprat
b9d1f0db18 Ongoing support for int64 for KNC:
Fixes to __load and __store.
Added __add, __mul, __equal, __not_equal, __extract_elements, __smear_i64, __cast_sext, __cast_zext,
and __scatter_base_offsets32_float.

__rcp_varying_float now has a fast-math and full-precision implementation.
2012-07-05 16:56:13 -07:00
Matt Pharr
6aad4c7a39 Bump version number to 1.3.1dev 2012-07-05 13:35:34 -07:00
Matt Pharr
4186ef204d Fix build with LLVM top of tree. 2012-07-05 13:35:01 -07:00
Matt Pharr
ae7a094ee0 Merge pull request #315 from NicolasT/master
Fix build on Fedora 17
2012-07-04 08:21:03 -07:00
Nicolas Trangez
3a007f939a Build: Include unistd.h where required
Some modules require an include of unistd.h (e.g. for getcwd and isatty
definitions).

These changes were required to build successfully on a Fedora 17 system,
using GCC 4.7.0 & glibc-headers 2.15.
2012-07-04 14:49:00 +02:00
Matt Pharr
b8503b9255 News and doxygen version number bump for 1.3.0 2012-06-29 08:38:38 -07:00
Matt Pharr
b7bc76d3cc Documentation updates for 1.3.0. 2012-06-29 08:35:29 -07:00
Matt Pharr
27d6c12972 Bump ISPC_MINOR_VERSION to 3 2012-06-28 16:15:46 -07:00
Matt Pharr
b69d783e09 Bump version to 1.3.0 2012-06-28 15:35:52 -07:00
Matt Pharr
3b2ff6301c Use fputs() rather than puts() for printing final result from print().
puts() sillily adds an undesired newline.
2012-06-28 12:29:40 -07:00
Matt Pharr
6c7043916e Silence bogus compiler warning 2012-06-28 12:11:56 -07:00
Matt Pharr
96a6e75b71 Fix issues with LLVM 3.0 and 3.1 build in cbackend.cpp
Should fix issue #312.
2012-06-28 12:11:27 -07:00
Matt Pharr
a91e4e7981 Fix missing ;s from 66d4c2ddd9 2012-06-28 12:04:58 -07:00
Jean-Luc Duprat
95d8f76ec3 Added preliminary support for Intel's Xeon Phi KNC processor.
float, int32, and double support is included; int8, int16, and int64 are
not supported yet.

This is work in progress and not considered stable yet.
2012-06-28 12:00:55 -07:00
Jean-Luc Duprat
66d4c2ddd9 When the --emit-c++ option is used, the state of the --opt=fast-math option is passed into the generated C++ code.
If --opt=fast-math is used then the generated code contains:
   #define ISPC_FAST_MATH 1
Otherwise it contains:
   #undef ISPC_FAST_MATH

This allows the generic headers to support the user's request.
2012-06-28 11:17:11 -07:00
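A sketch of how a generic intrinsics header can honor the define; the branches are comments only, since the specific approximations are implementation details:

    #ifdef ISPC_FAST_MATH
        // fast, lower-precision paths (e.g. rcp/rsqrt approximations without a refinement step)
    #else
        // full-precision implementations
    #endif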
Jean-Luc Duprat
8115ca739a Added preliminary support for Intel's Xeon Phi KNC processor.
float, int32, and double support is included; int8, int16, and int64 are not supported yet.
This is work in progress and not considered stable yet.
2012-06-28 10:54:09 -07:00
Jean-Luc Duprat
ec4021bbf4 When the --emit-c++ option is used, the state of the --opt=fast-math option is passed into the generated C++ code.
If --opt=fast-math is used then the generated code contains:
   #define ISPC_FAST_MATH 1
Otherwise it contains:
   #undef ISPC_FAST_MATH

This allows the generic headers to support the user's request.
2012-06-28 10:42:29 -07:00
Jean-Luc Duprat
e431b07e04 Changed the C API to use templates to indicate memory alignment to the C compiler
This should help with performance of the generated code.
Updated the relevant header files (sse4.h, generic-16.h, generic-32.h, generic-64.h)

Updated generic-32.h and generic-64.h to the new memory API
2012-06-28 09:29:15 -07:00
Matt Pharr
d34a87404d Provide (undocumented for now) __pause() call to emit PAUSE inst. 2012-06-28 09:28:25 -07:00
Matt Pharr
f38770bf2a Fix build with LLVM ToT 2012-06-28 07:36:10 -07:00
Jean-Luc Duprat
dc9998ccaf Missed a few minor fixes to generic-64.h in previous commit 2012-06-27 17:14:03 -07:00
Jean-Luc Duprat
f1b3703389 Changed the C API to use templates to indicate memory alignment to the C compiler
This should help with performance of the generated code.
Updated the relevant header files (sse4.h, generic-16.h, generic-32.h, generic-64.h)

Updated generic-32.h and generic-64.h to the new memory API
2012-06-27 16:59:26 -07:00
Jean-Luc Duprat
b6a8d0ee7f Merge branch 'master' of git://github.com/ispc/ispc 2012-06-27 10:15:24 -07:00
Jean-Luc Duprat
2a4dff38d0 cbackend.cpp now makes explicit use of the llvm namespace
(Rather than implicitly with a using declaration.)  This will
allow for some further changes to ISPC's C backend, without collision
with ISPC's namespace. This change aims to have no effect on the code
generated by the compiler; it should be a big no-op, except for its
side effects on maintainability.
2012-06-27 08:30:30 -07:00
Jean-Luc Duprat
665c564dcf cbackend.cpp now makes explicit use of the llvm namespace, rather than implicitly with a using declaration.
This will allow for some further changes to ISPC's C backend, without collision with ISPC's namespace.
This change aims to have no effect on the code generated by the compiler; it should be a big no-op, except
for its side effects on maintainability.
2012-06-26 22:15:31 -07:00
Jean-Luc Duprat
ed71413e04 Merge branch 'master' of git://github.com/ispc/ispc 2012-06-26 14:32:27 -07:00
Jean-Luc Duprat
4b5e49b00b Merge branch 'master' of github.com:jduprat/ispc 2012-06-26 14:32:01 -07:00
Matt Pharr
f558ee788e Fix bug with generating implicit zero initializer values.
Issue #300.
2012-06-26 11:58:16 -07:00
Matt Pharr
ceb8ca680c Fix crash in codegen for assert() with malformed program.
Issue #302.
2012-06-26 11:54:55 -07:00
Matt Pharr
79ebcbec4b Fix crash in SwitchStmt::TypeCheck() with malformed programs. 2012-06-26 11:21:33 -07:00
Matt Pharr
2c7b650240 Add FAQ to explain how to launch per-instance tasks with foreach_active and unmasked.
Issue #227.
2012-06-22 14:32:05 -07:00
Matt Pharr
54459255d4 Add unmasked { } statement.
This reestablishes an "all on" execution mask for the gang, which can
be useful for nested parallelism.
2012-06-22 14:30:58 -07:00
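A minimal sketch (function and variable names hypothetical) of the new statement and its effect on the execution mask:

    float zero_if_any_negative(float x) {
        float result = x;
        if (x < 0.) {
            unmasked {
                // the "all on" execution mask is re-established here, so every
                // program instance in the gang assigns result, not just those
                // for which x < 0
                result = 0.;
            }
        }
        return result;
    }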
Matt Pharr
b4a078e2f6 Add foreach_active iteration statement.
Issue #298.
2012-06-22 10:35:43 -07:00
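A minimal sketch of the construct; the names and the use of extract() are illustrative:

    void process_one_at_a_time(uniform float * uniform data, int index) {
        foreach_active (instanceNum) {
            // the body executes once for each currently-active program instance,
            // serially, so work that must not race (such as updating shared data
            // through a per-instance index) can be done safely here
            uniform int i = extract(index, instanceNum);
            data[i] += 1.;
        }
    }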
Matt Pharr
ed13dd066b Distinguish between 'regular' foreach and foreach_unique in FunctionEmitContext
We need to do this since it's illegal to have nested foreach statements, but
nested foreach_unique, or foreach_unique inside foreach, etc., are all fine.
2012-06-22 06:04:00 -07:00
Matt Pharr
2b4a3b22bf Issue an error if the user has nested foreach statements.
Partially addresses issue #280.  (We should support them properly,
but at least now we don't silently generate incorrect code.)
2012-06-21 16:53:27 -07:00
Matt Pharr
8b891da628 Allow referring to the struct type being defined in its members.
It's now legal to write:

struct Foo { Foo *next; };

previously, a predeclaration "struct Foo;" was required.  This fixes
issue #287.

This change also fixes a bug where multiple forward declarations 
"struct Foo; struct Foo;" would incorrectly issue an error on the
second one.
2012-06-21 16:44:04 -07:00
Matt Pharr
5a2c8342eb Allow structs with no members.
Issue #289.
2012-06-21 16:07:31 -07:00
Matt Pharr
50eb4bf53a Change print() implementation to accumulate string locally before printing.
The string to be printed is accumulated into a local buffer before being sent to
puts().  This ensure that if multiple threads are running and printing at the
same time, their output won't be interleaved (across individual print statements--
it still may be interleaved across different print statements, just like in C).

Issue #293.
2012-06-21 14:41:53 -07:00
Matt Pharr
3c10ddd46a Fix declaration of size_t.
It should be an unsigned integer type.
2012-06-21 14:40:24 -07:00
Matt Pharr
0b7f9acc70 Align <16 x i1> vectors to just 16 bits for generic targets.
Partially addresses issue #259.
2012-06-21 10:25:33 -07:00
Matt Pharr
10fbaec247 Fix C++ output for unordered fp compares.
Fixes a bug introduced in 46716aada3.
2012-06-21 09:57:19 -07:00
Matt Pharr
007a734595 Add support for 'unmasked' function qualifier. 2012-06-20 15:36:00 -07:00
Matt Pharr
46716aada3 Switch to unordered floating point compares.
In particular, this gives us desired behavior for NaNs (all compares
involving a NaN evaluate to true).  This in turn allows writing the
canonical isnan() function as "v != v".

Added isnan() to the standard library as well.
2012-06-20 13:25:53 -07:00
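A short sketch of what the change enables (function name hypothetical):

    bool is_nan(float v) {
        // with unordered compares, the canonical test works as intended:
        // v != v is true exactly when v is NaN (equivalently, use the new
        // stdlib isnan(v))
        return v != v;
    }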
Matt Pharr
3bc66136b2 Add foreach_unique iteration construct.
Idea via Ingo Wald / IVL compiler.
2012-06-20 10:04:24 -07:00
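A minimal sketch of the new construct (names hypothetical):

    void mark_seen(uniform bool * uniform seen, int bucket) {
        foreach_unique (b in bucket) {
            // the body runs once per distinct value in 'bucket'; within an iteration,
            // 'b' is uniform and only the instances whose bucket == b are active
            seen[b] = true;
        }
    }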
Matt Pharr
fae47e0dfc Update stdlib to not use "in" as a variable name.
Preparation for foreach_unique, which uses that as a keyword.
2012-06-20 10:04:24 -07:00
Matt Pharr
bd52e86486 Issue error on attempt to dereference void pointer types.
Issue #288.
2012-06-18 19:51:19 -07:00
Matt Pharr
b2f6ed7209 Fix usage of CastType 2012-06-18 16:26:31 -07:00
Matt Pharr
4b334fd2e2 Fix linkage for programIndex et al. when not debugging.
We now use InternalLinkage for the 'programIndex' symbol (and similar)
if we're not compiling with debugging symbols.  This prevents those
symbol names/definitions from polluting the global namespace for
the common case.

Basically addresses Issue #274.
2012-06-15 11:50:16 -07:00
Matt Pharr
a23a7006e3 Don't issue error incorrectly with forward decl. of exported function.
Issue #281.
2012-06-15 10:54:50 -07:00
Matt Pharr
f47171a17c Don't check for "all off" mask at function entry.
We should never be running with an all off mask and thus should never
enter a function with an all off mask.  No performance change from
removing this, however.

Issue #282.
2012-06-15 10:14:53 -07:00
Matt Pharr
4945dc3682 Add contributors link to docs HTML templates 2012-06-13 06:11:08 -07:00
Matt Pharr
ada66b5313 Make more attempts to pull out constant offsets for gather/scatter.
The "base+offsets" variants of gather decompose the integer offsets into
compile-time constant and compile-time unknown elements.  (The coalescing
optimization, then, depends on this decomposition being done well--having
as much as possible in the constant component.)  We now make multiple
efforts to improve this decomposition as we run optimization passes; in
some cases we're able to move more over to the constant side than was
first possible.

This in particular fixes issue #276, a case where coalescing was expected
but didn't actually happen.
2012-06-12 16:21:14 -07:00
Matt Pharr
96450e17a3 Do all memory op improvements in a single optimization pass.
Rather than having separate passes to do conversion, when possible, of:

- General gather/scatter of a vector of pointers to g/s of
  a base pointer and integer offsets
- Gather/scatter to masked load/store, load+broadcast
- Masked load/store to regular load/store

Now all are done in a single ImproveMemoryOps pass.  This change was in
particular to address some phase ordering issues that showed up with
multidimensional array access wherein after determining that an outer
dimension had the same index value, we previously weren't able to take
advantage of the uniformity of the resulting pointer.
2012-06-12 13:56:17 -07:00
Matt Pharr
40a295e951 Fix bug where "avx-x2" target would cause AVX1.1 to be used. 2012-06-12 13:37:38 -07:00
Matt Pharr
d6c6f95373 Do all replacements of __pseudo* memory ops in a single optimization pass.
Collected the old PseudoGSToGSPass and PseudoMaskedStorePass into a single
pass, ReplacePseudoMemoryOpsPass, which handles both of their tasks.
2012-06-12 13:10:03 -07:00
Matt Pharr
19b46be20d Remove load_and_broadcast from built-ins.
Now that we never ever run with the mask all off, we no longer need
that logic in a built-in function so that we can check the mask.  In
the one place where it was used (turning gathers to the same location
into a load and broadcast), we now just emit the code for that
directly.
2012-06-12 12:30:57 -07:00
Ingo Wald
789e04ce90 Add support for host/device stub functions for offload. 2012-06-12 10:23:49 -07:00
Matt Pharr
dd4f0a600b Update AVX1.1 targets to not include declarations of half/float routines in bit code. 2012-06-08 15:57:36 -07:00
Matt Pharr
6c7df4cb6b Add initial support for "avx1.1" targets for Ivy Bridge.
So far, only the use of the float/half conversion instructions distinguishes
this from the "avx1" target.

Partial work on issue #263.
2012-06-08 15:55:00 -07:00
Matt Pharr
79e0a9f32a Fix codegen bug with foreach_tiled.
When the outermost dimension(s) were partially active, but the innermost
dimension was all on, we'd inadvertently use an incorrect "all on"
execution mask.

Fixes issues #177 and #200.
2012-06-08 14:56:18 -07:00
Matt Pharr
6c9bc63a1c Improve SourcePos reporting of the origin of the gather for gather warnings. 2012-06-08 13:33:11 -07:00
Matt Pharr
28a821df7d Improve wording of gather/scatter performance warnings. 2012-06-08 13:32:57 -07:00
Matt Pharr
27e39954d6 Fix a number of issues in examples/intrinsics/sse4.h.
This had gotten fairly out of date, after recent changes to C++ output.
Roughly 15 tests still fail with this target.

Issue #278.
2012-06-08 12:52:36 -07:00
Matt Pharr
e730a5364b Issue error if any complex assignment operator is used with a struct type.
Issue #275.
2012-06-08 11:29:02 -07:00
Matt Pharr
92b3ae41dd Don't print request to file bug on fatal error twice. 2012-06-08 11:23:45 -07:00
Matt Pharr
89a2566e01 Add separate variants of memory built-ins for floats and doubles.
Previously, we'd bitcast e.g. a vector of floats to a vector of i32s and then
use the i32 variant of masked_load/masked_store/gather/scatter.  Now, we have
separate float/double variants of each of those.
2012-06-07 14:47:16 -07:00
Matt Pharr
1ac3e03171 Gather/scatter function improvements in builtins.
More naming consistency: _i32 rather than i32, now.

Also improved the m4 macros to generate these sequences to not require as
many parameters.
2012-06-07 14:19:23 -07:00
Matt Pharr
b86d40091a Improve naming of masked load/store instructions in builtins.
Now, use _i32 suffixes, rather than _32, etc.  Also cleaned up the m4
macro to generate these functions, using WIDTH to get the target width,
etc.
2012-06-07 13:58:31 -07:00
Matt Pharr
91d22d150f Update load_and_broadcast built-in
Change function suffix to "_i32", etc, from "_32"

Improve load_and_broadcast macro in util.m4 to grab vector width from 
WIDTH variable rather than taking it as a parameter.
2012-06-07 13:33:17 -07:00
Matt Pharr
1d29991268 Indentation fixes in builtins/ 2012-06-07 13:23:07 -07:00
Matt Pharr
6f0a2686dc Use %a format for printf() for float constants on non-Windows platforms. 2012-06-07 13:20:03 -07:00
Matt Pharr
f06caabb07 Generate better code for break statements in varying loops (sometimes).
If we have a simple varying 'if' statement where the only code in the body is
a single 'break', then emit special case code that just updates the execution
mask directly.

Surprisingly, this leads to better generated code (e.g. Mandelbrot 7.1x on AVX
vs 5.8x before).  It's not clear why the general code generation path for
break doesn't generate the equivalent code; this topic should be investigated
further.  (Issue #277).
2012-06-06 11:08:42 -07:00
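An illustrative, Mandelbrot-style sketch (names hypothetical) of the pattern that now takes the special-case path, a varying 'if' whose whole body is a single 'break':

    int escape_count(float z, uniform int maxIters) {
        int count = 0;
        for (uniform int i = 0; i < maxIters; ++i) {
            if (z > 4.)
                break;       // the entire 'if' body is one 'break': now emitted as
                             // a direct update of the execution mask
            z = z * z;
            ++count;
        }
        return count;
    }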
Matt Pharr
3c869802fb Always store multiply-used vector compares in temporary variables (C++ output). 2012-06-06 11:08:42 -07:00
Matt Pharr
7b6bd90903 Remove various equality checks between GetInternalMask() and LLVMMaskAllOn
These were never kicking in, since GetInternalMask() always loads from the
mask storage memory.
2012-06-06 11:08:42 -07:00
Matt Pharr
967bfa9c92 Silence compiler warning. 2012-06-06 08:08:55 -07:00
Matt Pharr
592affb984 Add experimental (and undocumented for now) export syntax.
This allows adding types to the list that are included in the automatically-generated
header files.

struct Foo { . . . };
struct Bar { . . . };

export { Foo, Bar };
2012-06-05 12:51:21 -07:00
Matt Pharr
96aaf6d53b Fix build with LLVM top of tree. 2012-06-05 12:28:05 -07:00
Matt Pharr
1397dbdabc Don't generate colorized output escapes when stderr isn't a TTY.
When piping to a file, more/less, etc., this is generally undesirable.

This behavior can be overridden with the --colorized-output command-line
flag.
2012-06-04 09:20:57 -07:00
Matt Pharr
6118643232 Handle more error cases if the user tries to declare a method. 2012-06-04 09:07:13 -07:00
Matt Pharr
71198a0b54 Don't indent too much in errors/warnings if the filename is long. 2012-06-04 08:53:43 -07:00
Matt Pharr
22cb80399f Issue error if user tries to declare a method. 2012-06-04 08:50:13 -07:00
Jean-Luc Duprat
fa1fd8a576 Merged Upstream 2012-06-01 11:13:16 -07:00
Matt Pharr
6df7d31a5b Fix incorrect assertion.
Issue #272.
2012-05-30 16:34:59 -07:00
Matt Pharr
ef049e92ef Handle undefined struct types when generating headers. 2012-05-30 16:28:21 -07:00
Matt Pharr
fe8b109ca5 Fix more tests for 32 and 64-wide execution. 2012-05-30 13:06:07 -07:00
Matt Pharr
8fd9b84a80 Update seed_rng() in stdlib to take a varying seed.
Previously, we were trying to take a uniform seed and then shuffle that
around to initialize the state for each of the program instances.  This
was becoming increasingly untenable and brittle.

Now a varying seed is expected and used.
2012-05-30 10:35:41 -07:00
Matt Pharr
5cb53f52c3 Fix various tests/[frs]* files to be correct with 32 and 64-wide targets.
Still todo: tests/c*, tests/test-*
2012-05-30 10:31:12 -07:00
Matt Pharr
d86653668e Fix a number of tests to work correctly with 32/64-wide targets.
Still to be reviewed/fixed: tests/test-*, tests/[cfrs]*
2012-05-29 10:16:43 -07:00
Matt Pharr
5084712a15 Fix bugs in examples/intrinsics/generic-64.h
There were a number of situations where we were left-shifting 1 by a
lane index that were failing due to shifting beyond 32 bits.  Fixed
by shifting the 64-bit constant value 1ull.
2012-05-29 08:31:10 -07:00
Jean-Luc Duprat
ece65cab18 Fix some tests for up to 64-wide gangs 2012-05-29 07:52:50 -07:00
Matt Pharr
1f6075506c Fix linux build (Jean-Luc Duprat) 2012-05-28 19:45:16 -07:00
Matt Pharr
51ade48e3d Fix some of the reduce-* tests for 32 and 64-wide targets 2012-05-25 14:47:06 -07:00
Matt Pharr
21c43737fe Fix bug in examples/intrinsics/generic-32.h 2012-05-25 14:27:30 -07:00
Matt Pharr
6c7bcf00e7 Add examples/intrinsics/generic-64.h. 2012-05-25 14:27:19 -07:00
Matt Pharr
7a2142075c Add examples/intrinsics/generic-32.h implementation.
Roughly 100 tests fail with this; all the tests need to be audited
for assumptions that 16 is the widest width possible…
2012-05-25 12:37:59 -07:00
Matt Pharr
e8e9baa417 Update test_static.cpp to handle up to 64-wide 2012-05-25 12:14:58 -07:00
Matt Pharr
449d956966 Add support for generic-64 target. 2012-05-25 11:57:28 -07:00
Matt Pharr
90db01d038 Represent MOVMSK'ed masks with int64s rather than int32s.
This allows us to scale up to 64-wide execution.
2012-05-25 11:57:23 -07:00
Matt Pharr
38cea6dc71 Issue error if "typedef" is inadvertently included in function definition.
Issue #267.
2012-05-25 11:09:26 -07:00
Matt Pharr
64807dfb3b Add AssertPos() macro that provides rough source location in error
It can sometimes be useful to know the general place we were in the program
when an assertion hit; when the position is available / applicable, this
macro is now used.

Issue #268.
2012-05-25 10:59:45 -07:00
Matt Pharr
d943455e10 Issue error on overloaded "export"ed functions.
Issue #270.
2012-05-25 10:35:34 -07:00
Matt Pharr
fd03ba7586 Export reference parameters as C++ references, not pointers. 2012-05-24 07:12:48 -07:00
Matt Pharr
2c5a57e386 Fix bugs related to varying pointers to functions that return void. 2012-05-23 14:29:17 -07:00
Matt Pharr
e8858150cb Allow redundant semicolons at global scope. (Ingo Wald) 2012-05-23 14:20:20 -07:00
Matt Pharr
333f901187 Fix build with LLVM 3.2 dev top-of-tree 2012-05-23 14:19:50 -07:00
Matt Pharr
7dd4d6c75e Update for LLVM 3.2dev API change 2012-05-22 15:53:14 -07:00
Matt Pharr
99f57cfda6 Issue more sensible error message for varying pointers in exported functions. 2012-05-18 12:00:11 -07:00
Matt Pharr
4d1eb94dfd Fix bug in AddElementOffset() error checking. 2012-05-18 11:57:05 -07:00
Matt Pharr
22d584f302 Don't issue perf. warnings for various conversions with generic target. 2012-05-18 11:56:11 -07:00
Matt Pharr
72c41f104e Fix various malformed program crashes. 2012-05-18 10:44:45 -07:00
Matt Pharr
8d3ac3ac1e Fix build with LLVM ToT 2012-05-18 10:09:09 -07:00
Matt Pharr
299ae186f1 Expect support for half and transcendentals from all generic targets 2012-05-18 06:13:45 -07:00
Matt Pharr
f4df2fb176 Improvements to mask update code for generic targets.
Rather than XOR'ing with a temporary 'all-on' vector, we call
__not.  Also, we call out to __and_not1 and __and_not2, for an
AND where the first or second operand, respectively, has had
NOT applied to it.
2012-05-16 13:52:51 -07:00
Matt Pharr
625fbef613 Fix Windows build 2012-05-15 12:19:10 -07:00
Matt Pharr
fbed0ac56b Remove allOffMaskIsSafe from Target
The intent of this was to indicate whether it was safe to run code
with an 'all off' mask on the given target (and then sometimes be
more flexible about e.g. running both true and false blocks of if
statements, etc.)

The problem is that even if the architecture has full native mask support,
it's still not safe to run 'uniform' memory operations with the mask all
off.  Even more tricky, we sometimes transform masked varying memory operations
to uniform ones during optimization (e.g. gather->load and broadcast).

This fixes a number of the tests/switch-* tests that were failing on the
generic targets due to this issue.
2012-05-09 14:18:47 -07:00
Matt Pharr
dc120f3962 Fix regression in masked_store_blend for generic target.
In ee1fe3aa9f, the LLVM_VERSION define was updated to never
have the 'svn' suffix and the build was updated to handle LLVM
3.2.  This file had a check for LLVM_3_1svn that was no longer
hitting.

This fixes some issues with unnecessary loads and stores
in generated C++ code for the generic targets.
2012-05-09 14:18:47 -07:00
Matt Pharr
4f053e5b83 Pass OPT flags when linking 2012-05-08 13:25:09 -07:00
Matt Pharr
c6241581a0 Add an extra parameter to __smear functions to encode return type.
Now, the __smear* functions in generated C++ code have an unused first
parameter of the desired return type; this allows us to have headers
that include variants of __smear for multiple target widths.  (This
approach is necessary since we can't overload by return type in C++.)

Issue #256.
2012-05-08 09:54:23 -07:00
Nipunn Koorapati
041ade66d5 Placated compiler by initializing variable 2012-05-06 06:59:17 -07:00
Nipunn Koorapati
067a2949ba Added syntax highlighting for 'uniform' and 'varying' types. 2012-05-06 06:58:53 -07:00
Matt Pharr
55c754750e Remove a number of redundant/unneeded optimization passes.
Performance and code quality of performance suite is unchanged,
compilation times are improved by another 20% or so for simple
programs (e.g. rt.ispc).  One very complex program compiles
about 2.4x faster now.
2012-05-05 15:47:24 -07:00
Matt Pharr
72b6c12856 Notify LLVM pass mgr that the MakeInternalFuncsStaticPass doesn't change the CFG. 2012-05-05 15:47:24 -07:00
Matt Pharr
15ea0af687 Add -f option to run_tests.py
This allows providing additional command-line arguments to ispc,
e.g. to force compilation with -O1, -g, etc.
2012-05-05 15:47:24 -07:00
Matt Pharr
ee7e367981 Do global dead code elimination early in optimization.
This gives a 15-20% speedup in compilation time for simple
programs (but only ~2% for the big 21k monster program).
2012-05-05 15:47:19 -07:00
Matt Pharr
8006589828 Use llvm::SmallVectors for struct member types and function types.
Further reduction of dynamic memory allocation...
2012-05-04 13:55:38 -07:00
Matt Pharr
413264eaae Make return values const &s to save copying. 2012-05-04 13:55:38 -07:00
Matt Pharr
7db8824da2 Reduce dynamic memory allocation in getting unif/varying variants of AtomicTypes 2012-05-04 13:55:38 -07:00
Matt Pharr
e1bc010bd1 More reduction of dynamic allocations in lDoTypeConv() 2012-05-04 13:55:38 -07:00
Matt Pharr
bff02017da Cache const/non-const variants of Atomic and ReferenceTypes.
More reduction of dynamic memory allocation.
2012-05-04 13:55:38 -07:00
Matt Pharr
c0019bd8e5 Cache type and lvalue type in IndexExpr and MemberExpr
This saves a bunch of redundant work and unnecessary duplicated
memory allocations.
2012-05-04 13:55:38 -07:00
Matt Pharr
e495ef2c48 Reduce dynamic memory allocation by reusing scope maps in symbol table. 2012-05-04 13:55:38 -07:00
Matt Pharr
78d62705cc Cache element types in StructType.
Previously, GetElementType() would end up causing dynamic allocation to
happen to compute the final element type (turning types with unbound
variability into the same type with the struct's variability) each it was
called, which was wasteful and slow.  Now we cache the result.

Another 20% perf on compiling that problematic program.
2012-05-04 13:55:38 -07:00
Matt Pharr
2791bd0015 Improve performance of lCheckTypeEquality()
We don't need to explicitly create the non-const Types to do type
comparison when ignoring const-ness in the check.

We can also save some unnecessary dynamic memory allocation by
keeping strings returned from GetStructName() as references to strings.

This gives another 10% on front-end perf on that big program.
2012-05-04 13:55:38 -07:00
Matt Pharr
7cf66eb61f Small optimizations to various AtomicType methods. 2012-05-04 13:55:38 -07:00
Matt Pharr
944c53bff1 Stop using dynamic_cast for Types.
We now have a set of template functions CastType<AtomicType>, etc., that in
turn use a new typeId field in each Type instance, allowing them to be inlined
and to be quite efficient.

This improves front-end performance for a particular large program by 28%.
2012-05-04 13:55:38 -07:00
Matt Pharr
c756c855ea Compile with -O2 by default on Linux/OSX. 2012-05-04 13:55:37 -07:00
Matt Pharr
58bb2826b2 Perf: cache connection between const/non-const struct variants.
In one very large program, we were spending quite a bit of time repeatedly
getting const variants of StructTypes.  This speeds up the front-end by
about 40% for that test case.

(This is something of a band-aid, pending uniquing types.)
2012-05-04 13:55:37 -07:00
Nipunn Koorapati
b7bef87a4d Added README for vim syntax highlighting. 2012-05-03 14:23:33 -07:00
Matt Pharr
0c1b206185 Pass log/exp/pow transcendentals through to targets that support them.
Currently, this is the generic targets.
2012-05-03 13:49:56 -07:00
Matt Pharr
7d7e99a92c Update ISPC_MINOR_VERSION to 2
(This should have been done with the 1.2.0 release!)
2012-05-03 12:04:24 -07:00
Matt Pharr
1ba8d7ef74 Fix test that had undefined behavior. 2012-05-03 11:11:21 -07:00
Matt Pharr
d99bd279e8 Add generic-32 target. 2012-05-03 11:11:06 -07:00
Matt Pharr
ee1fe3aa9f Update build to handle existence of LLVM 3.2 dev branch.
We now compile with LLVM 3.0, 3.1, and 3.2svn.
2012-05-03 08:25:25 -07:00
Matt Pharr
c4b1d79c5c When a function is defined, set its symbol's position to the code position.
Before, if the function was declared before being defined, then the symbol's
SourcePos would be left set to the position of the declaration.  This ended
up getting the debugging symbols mixed up in this case, which was undesirable.
2012-04-28 20:28:39 -07:00
Matt Pharr
a1a43cdfe0 Fix bug so that programIndex (et al.) are available in the debugger.
It's now possible to successfully print out the value of programIndex,
programCount, etc., in the debugger.  The issue was that they were
defined as having InternalLinkage, which meant that DCE removed them
at the end of compilation.  Now they're declared to have WeakODRLinkage,
which ensures that one copy survives (but there aren't multiply-defined
symbols when compiling multiple files.)
2012-04-28 17:12:57 -07:00
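Schematically, the linkage change in LLVM C++ API terms (the variable name programIndexGlobal is invented for this sketch):

    // Before: internal linkage let dead-code elimination strip the symbol,
    // so the debugger had nothing left to print.
    // programIndexGlobal->setLinkage(llvm::GlobalValue::InternalLinkage);

    // After: weak ODR linkage keeps one copy of the symbol alive while still
    // allowing every compiled module to define it without multiple-definition
    // link errors.
    programIndexGlobal->setLinkage(llvm::GlobalValue::WeakODRLinkage);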
Matt Pharr
27b62781cc Fix bug in lStripUnusedDebugInfo().
This was causing an assert to hit in llvm's DwarfDebug.cpp.
2012-04-28 13:06:29 -10:00
Matt Pharr
0c5d7ff8f2 Add rygorous's float->srgb8 conversion routine to the stdlib.
Issue #230
2012-04-27 10:03:19 -10:00
Matt Pharr
0e2b315ded Add FAQ about foreach code generation.
(i.e. "why's there that extra stuff at the end and what can I do
about it if it's not necessary?")

Issue #231.
2012-04-27 09:35:37 -10:00
Matt Pharr
3e74d1c544 Fix documentation bug with typedef. 2012-04-25 17:15:20 -10:00
Matt Pharr
da690acce5 Fix build with LLVM 3.0 2012-04-25 14:27:33 -10:00
Matt Pharr
0baa2b484d Fix multiple bugs related to DIBuilder::createFunction() call.
The DIType passed to this method should correspond to the
FunctionType of the function, not its return type.

The first parameter should be the DIScope for the compile unit,
not the DIFile.

We previously had the unmangled function name and the mangled
function name interchanged.

The argument corresponding to "first line number of the function" was
missing, which in turn led to subsequent arguments being off, and thus
providing bogus values vs. what was supposed to be passed.

Rename FunctionEmitContext::diFunction to diSubprogram, to better
reflect its type.
2012-04-25 08:43:11 -10:00
Matt Pharr
260d7298c3 Strip unused debugging metadata after done with compilation.
Debugging information for functions that are inlined or static and
not used still hangs around after compilation; now we go through the
debugging info and remove the entries for any DISubprograms that
don't have their original functions left in the Module after
optimization.
2012-04-25 08:43:11 -10:00
Matt Pharr
d5cc2ad643 Call Verify() methods of various debugging llvm::DI* types after creation. 2012-04-25 08:43:11 -10:00
Matt Pharr
12706cd37f Debugging optimization pass updates
Don't run mem2reg with -O0 anymore, but do run the intrinsics opt pass, which
allows some CFG simplification due to the mask being all on, etc.
2012-04-25 08:43:11 -10:00
Matt Pharr
7167442d6e Debugging info: include parameter number for function params. 2012-04-25 08:43:11 -10:00
Matt Pharr
8547101c4b Debugging info: produce more descriptive producer string 2012-04-25 08:43:11 -10:00
Matt Pharr
5d58a9e4c2 Merge pull request #250 from jfpoole/master
Fix 32-bit samples on Mac OS X.
2012-04-23 17:12:46 -07:00
John Poole
cd98a29a4b Fix 32-bit samples on Mac OS X.
On Mac OS X and Linux rdtsc() didn't save and restore 32-bit registers.

This patch fixes issue #87.
2012-04-23 16:00:07 -07:00
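For reference, the usual way to write rdtsc() so the compiler knows which registers the instruction writes; this is a generic sketch, not necessarily the exact patch:

    #include <stdint.h>

    static inline uint64_t rdtsc() {
        uint32_t lo, hi;
        // "=a"/"=d" declare EAX/EDX as outputs, so the compiler saves and
        // restores them as needed instead of having the instruction silently
        // clobber them.
        __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
        return ((uint64_t)hi << 32) | lo;
    }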
Matt Pharr
903714fd40 Merge pull request #248 from nipunn1313/master
Goto with incorrect label now suggests labels based on string distance
2012-04-21 14:43:57 -07:00
Nipunn Koorapati
138c7acf22 Error() and Warning() functions for reporting compiler errors/warnings now respect newlines as part of valid error messages. 2012-04-21 01:44:10 -04:00
Matt Pharr
03b2b8ae8f Bump version number to 1.2.3dev 2012-04-20 14:31:46 -07:00
Matt Pharr
016b502d46 Update release notes for 1.2.2, bump version number in doxygen 2012-04-20 14:26:00 -07:00
Matt Pharr
c5f6653564 Bump version number to 1.2.2 2012-04-20 11:54:12 -07:00
Matt Pharr
cf9a4e209e Fix malformed program crash. 2012-04-20 11:53:43 -07:00
Nipunn Koorapati
040421942f Goto statements with a bad label produce an error message.
Now it also produces a short list of suggestions based on string distance.
2012-04-20 14:42:14 -04:00
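A hypothetical sketch of how label suggestions by string distance can be computed (names invented for illustration):

    #include <algorithm>
    #include <string>
    #include <vector>

    // Classic Levenshtein edit distance.
    static int EditDistance(const std::string &a, const std::string &b) {
        std::vector<int> prev(b.size() + 1), cur(b.size() + 1);
        for (size_t j = 0; j <= b.size(); ++j) prev[j] = (int)j;
        for (size_t i = 1; i <= a.size(); ++i) {
            cur[0] = (int)i;
            for (size_t j = 1; j <= b.size(); ++j)
                cur[j] = std::min(std::min(prev[j] + 1, cur[j - 1] + 1),
                                  prev[j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1));
            prev.swap(cur);
        }
        return prev[b.size()];
    }

    // Return labels within a small edit distance of the misspelled one.
    static std::vector<std::string>
    SuggestLabels(const std::string &bad, const std::vector<std::string> &labels) {
        std::vector<std::string> matches;
        for (size_t i = 0; i < labels.size(); ++i)
            if (EditDistance(bad, labels[i]) <= 2)
                matches.push_back(labels[i]);
        return matches;
    }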
Matt Pharr
4dfc596d38 Fix MSVC warnings. 2012-04-20 10:50:39 -07:00
Matt Pharr
fe83ef7635 Merge pull request #247 from nipunn1313/master
Fixed compiler warning
2012-04-20 09:26:57 -07:00
Nipunn Koorapati
db8b08131f Fixed compile error which shows up on LLVM 3.0 2012-04-20 12:17:09 -04:00
Matt Pharr
32815e628d Improve naming of llvm Instructions created.
We now try harder to keep the names of instructions related to the
initial names of variables they're derived from and so forth.  This
is useful for making both LLVM IR as well as generated C++ code
easier to correlate back to the original ispc source code.

Issue #244.
2012-04-19 16:36:46 -07:00
Matt Pharr
71bdc67a45 Add LLVMGetName() utility routines.
Infrastructure for issue #244.
2012-04-19 16:24:40 -07:00
Matt Pharr
cb9f50ef63 C++ backend: mangle variable names less.
This makes the generated code a little easier to connect with the
original program.
2012-04-19 13:11:47 -07:00
Matt Pharr
12c754c92b Improved handling of splatted constant vectors in C++ backend.
Now, when we're printing out a constant vector value, we check to see
if it's a splat and call out to one of the __splat_* functions in
the generated code if so.
2012-04-19 13:11:15 -07:00
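Roughly what the splat check looks like against the LLVM C++ API of that era (EmitSplatCall and EmitFullVectorLiteral are invented names, and the exact API differs a bit between LLVM 3.0 and 3.1):

    // When printing a constant vector in the generated C++, check for a splat
    // first and call a helper instead of spelling out every element.
    llvm::Constant *splat = NULL;
    if (llvm::ConstantDataVector *cdv =
            llvm::dyn_cast<llvm::ConstantDataVector>(constantVec))
        splat = cdv->getSplatValue();
    if (splat != NULL)
        EmitSplatCall(splat);            // e.g. emits "__splat_i32(42)"
    else
        EmitFullVectorLiteral(constantVec);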
Matt Pharr
e4b3d03da5 When available, use ANSI escapes to colorize diagnostic output.
Issue #245.
2012-04-19 11:36:28 -07:00
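A minimal sketch of the usual approach, colorizing only when stderr is a terminal (not the literal ispc implementation):

    #include <stdio.h>
    #include <unistd.h>

    static void PrintError(const char *msg) {
        // Only emit escape codes when stderr is actually a terminal.
        if (isatty(fileno(stderr)))
            fprintf(stderr, "\033[1;31mError\033[0m: %s\n", msg);  // bold red "Error"
        else
            fprintf(stderr, "Error: %s\n", msg);
    }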
Matt Pharr
cc26b66e99 Improve source position reporting for scatters.
Now, we only highlight the memory write--not both sides of the
assignment expression.
2012-04-19 11:23:20 -07:00
Matt Pharr
34d81fa522 Fix bugs in tests.
These two tests were walking past the end of the aFOO[] array, which
in turn was leading to failures with the generic-16/c++ output path.
2012-04-19 10:33:33 -07:00
Matt Pharr
49f1a5c2b3 Add print() statements to tests to indicate failure details.
These tests all fail with generic-16/c++ output currently; however, the
output indicates that it's just small floating-point differences.
(Though the question remains, why are those differences popping up?)
2012-04-19 10:32:55 -07:00
Matt Pharr
326c45fa17 Fix bugs in LLVMExtractFirstVectorElement().
When we're manually scalarizing the extraction of the first element
of a vector value, we need to be careful about handling constant values
and about where new instructions are inserted.  The old code was
sloppy about this, which in turn led to invalid IR in some cases.
For example, the two bugs below were essentially due to generating
an extractelement inst from a zeroinitializer value and then inserting
it in the wrong bblock such that a phi node that used that value was
malformed.

Fixes issues #240 and #229.
2012-04-19 09:45:04 -07:00
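The two cases described above, sketched against the LLVM C++ API (function and variable names are illustrative):

    // Extract element 0 of 'vec' as a scalar value.
    static llvm::Value *ExtractFirstElement(llvm::Value *vec,
                                            llvm::Instruction *insertBefore) {
        // Constants (including zeroinitializer) can be folded directly;
        // creating an extractelement instruction from one and dropping it
        // into an arbitrary block is what produced the malformed phi nodes.
        if (llvm::Constant *c = llvm::dyn_cast<llvm::Constant>(vec))
            return c->getAggregateElement(0u);

        // Otherwise create the instruction at a well-defined insertion point
        // rather than wherever the builder happens to be positioned.
        llvm::Value *zero = llvm::ConstantInt::get(
            llvm::Type::getInt32Ty(vec->getContext()), 0);
        return llvm::ExtractElementInst::Create(vec, zero, "first_elt",
                                                insertBefore);
    }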
Matt Pharr
a2bb899a6b Opt debug printing improvement
Now, just match the prefix of the provided function name of interest,
which means we don't have to worry about name-mangling details.
2012-04-19 09:34:54 -07:00
Matt Pharr
9fedb1674e Improve basic block dumping from optimization passes.
Now done via a macro, which is cleaner.  It's also now possible to
specify a single function to watch, which is useful for debugging.
2012-04-18 15:46:18 -07:00
Matt Pharr
7c91b01125 Handle more forms of constant vectors in lGetMask().
Various optimization passes depend on turning a compile-time constant
mask into a bit vector; it turns out that in LLVM3.1, constant vectors
of ints/floats are represented with llvm::ConstantDataVector, but
constant vectors of bools use llvm::ConstantVector (which is what LLVM
3.0 uses for all constant vectors).  Now lGetMask() always does the
llvm::ConstantVector path, to cover this case.

This improves generated C++ code by eliminating things like select
with an all on/off mask, turning movmask calls with constants into
constant values, etc.
2012-04-18 11:39:11 -07:00
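A rough sketch of walking the lanes of either constant representation to build the bit mask (hypothetical code, not lifted from opt.cpp):

    // Works for both llvm::ConstantVector (bool vectors, and everything on
    // LLVM 3.0) and llvm::ConstantDataVector (int/float vectors on LLVM 3.1).
    static bool GetCompileTimeMask(llvm::Constant *mask, int nLanes,
                                   uint64_t *bits) {
        *bits = 0;
        for (int i = 0; i < nLanes; ++i) {
            llvm::Constant *lane = mask->getAggregateElement((unsigned)i);
            llvm::ConstantInt *ci = llvm::dyn_cast_or_null<llvm::ConstantInt>(lane);
            if (ci == NULL)
                return false;                // lane isn't a compile-time constant
            if (!ci->isZero())
                *bits |= (1ull << i);
        }
        return true;
    }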
Matt Pharr
c202e9e106 Add debugging printing code to optimization passes.
Now all of the passes dump out the basic block before and after
they do their thing when --debug is enabled.
2012-04-18 11:39:10 -07:00
Matt Pharr
645a8c9349 Fix serious bug in VSelMovmskOpt
When the mask was all off, we'd choose the incorrect operand!

(This bug was masked since this optimization wasn't triggering as
intended, due to other issues to be fixed in a forthcoming commit.)
2012-04-18 11:39:10 -07:00
Jean-Luc Duprat
093fdcf3df Fixed bad integration 2012-04-18 09:39:54 -07:00
Jean-Luc Duprat
7abda5e8c2 Merge branch 'master' of git://github.com/ispc/ispc 2012-04-18 09:24:35 -07:00
Matt Pharr
abf7c423bb Fix build with LLVM 3.0 2012-04-18 06:14:55 -07:00
Matt Pharr
55d5c07d00 Issue errors when doing illegal things with incomplete struct types.
Issue an error, rather than crashing, if the user has declared a
struct type but not defined it and subsequently tries to:

- dynamically allocate an instance of the struct type
- do pointer math with a pointer to the struct type
- compute the size of the struct type
2012-04-18 06:08:05 -07:00
Jean-Luc Duprat
0a9b272fe4 Merge branch 'master' of git://github.com/ispc/ispc 2012-04-17 15:34:36 -07:00
Matt Pharr
b9d6ba2aa0 Always set target info, even when compiling to generic targets.
This allows the SROA pass to eliminate a lot of allocas and loads and
stores, which helps a lot for performance.
2012-04-17 15:10:30 -07:00
Matt Pharr
a0c9f7823b C++ backend fixes.
Handle calls to llvm.trap()
Declare functions before globals
Handle memset()
2012-04-17 15:09:42 -07:00
Jean-Luc Duprat
4477a9c59a Merge branch 'master' of git://github.com/ispc/ispc
Conflicts:
	decl.cpp
2012-04-17 10:38:07 -07:00
Matt Pharr
99a27fe241 Add support for forward declarations of structures.
Now a declaration like 'struct Foo;' can be used to establish the
name of a struct type, without providing a definition.  One can
pass pointers to such types around the system, but can't do much
else with them (as in C/C++).

Issue #125.
2012-04-16 06:27:21 -07:00
Matt Pharr
fefa86e0cf Remove LLVM_TYPE_CONST #define / usage.
Now with LLVM 3.0 and beyond, types aren't const.
2012-04-15 20:11:27 -07:00
Matt Pharr
098c4910de Remove support for building with LLVM 2.9.
A forthcoming change uses some features of LLVM 3.0's new type
system, and it's not worth back-porting this to also all work
with LLVM 2.9.
2012-04-15 20:08:51 -07:00
Matt Pharr
17b7148300 Initial implementation of FunctionType::GetDIType 2012-04-13 19:50:45 -07:00
Matt Pharr
f4a2ef28e3 Fix crashes from malformed programs. 2012-04-13 19:42:07 -07:00
Matt Pharr
f0d013ee76 Fix incorrect assert. Issue #241 2012-04-12 20:19:41 -07:00
Matt Pharr
5ece6fec04 Substantial rewrite (again) of decl handling.
The decl.* code now no longer interacts with Symbols, but just returns
names, types, initializer expressions, etc., as needed.  This makes the
code a bit more understandable.

Fixes issues #171 and #130.
2012-04-12 17:28:30 -07:00
Matt Pharr
d88dbf3612 Fix two bugs with resolving unbound variability.
We still need to call ResolveUnboundVariability even if the
type returns false from HasUnboundVariability; we may have,
for example, a pointer type where the pointer is resolved,
but the pointed-to type is unresolved.

Fixes issue #228.
2012-04-12 11:40:28 -07:00
Matt Pharr
2a18efef82 Use type conversion machinery when processing expr lists for initializers.
Once we're down to something that's not another nested expr list, use 
TypeConvertExpr() to convert the expression to the type we need.  This should
allow simplifying a number of the GetConstant() implementations, to remove
partial reimplementation of type conversion there.

For now, this change finishes off issue #220.
2012-04-12 11:23:02 -07:00
Matt Pharr
fd846fbe77 Fix bug in __gather_base_offsets_32.
In short, we weren't correctly zeroing the compile-time constant portion
of the offsets for lanes that aren't executing. (!)

Fixes issue #235.
2012-04-12 10:28:15 -07:00
Matt Pharr
ca7cc4744e Fix bug with taking references of temporaries.
Previously, the compiler would crash if e.g. the program passed a
temporary value to a function taking a const reference.  This change
fixes ReferenceExpr::GetValue() to handle this case and allocate
temporary storage for the temporary so that the pointer to that
storage can be used for the reference value.
2012-04-12 06:08:19 -07:00
Matt Pharr
491fa239bd Add atomic swap and cmpxchg for void * as well.
Issue #232.
2012-04-11 06:12:31 -07:00
Matt Pharr
66765dc123 Fix printing of function overload candidates in error message. 2012-04-11 06:11:52 -07:00
Matt Pharr
70a5348f43 Add size_t, ptrdiff_t, and [u]intptr_t types. 2012-04-11 05:32:53 -07:00
Matt Pharr
2aa61007c6 Remove memory_barrier() calls from atomics.
This was unnecessary overhead to impose on all callers; the user
should handle these as needed on their own.

Also added some explanatory text to the documentation that highlights
that memory_barrier() is only needed across HW threads/cores, not
across program instances in a gang.
2012-04-10 19:37:03 -07:00
Matt Pharr
acfbe77ffc Fix typo. 2012-04-10 19:27:37 -07:00
Matt Pharr
08696653ca Don't include struct member types in mangled string.
Not only was this quite verbose, it was unnecessary since we do type
equality by name.  This also needed to be fixed before we could
handle structs declared like "struct Foo;", when we then e.g. have
other structs with Foo * members.
2012-04-10 19:27:31 -07:00
Matt Pharr
8a1a214ca9 Provide required alignment when generating debug info for pointer types. 2012-04-09 14:36:39 -07:00
Matt Pharr
7aaeb27e0f Remove duplicate test. 2012-04-09 14:23:17 -07:00
Matt Pharr
972043c146 Fix serious bug in handling constant-valued initializers.
In InitSymbol(), we try to be smart and emit a memcpy when there
are a number of values to store (e.g. for arrays, structs, etc.)

Unfortunately, this wasn't working as desired for bools (i.e. i1 types),
since the SizeOf() call that tried to figure out how many bytes to
copy would return 0 bytes, due to dividing the number of bits to copy
by 8.

Fixes issue #234.
2012-04-09 14:23:08 -07:00
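The underlying arithmetic: an i1 has one bit, so dividing the bit count by 8 truncates to zero bytes. A sketch of the usual round-up fix (illustrative variable names):

    // Wrong: an i1 is 1 bit, so 1 / 8 == 0 and the memcpy copies nothing.
    //     unsigned bytes = bitSize / 8;
    // Right: round up to whole bytes.
    unsigned bytes = (bitSize + 7) / 8;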
Matt Pharr
8475dc082a Bump version number to 1.2.2dev 2012-04-06 16:16:50 -07:00
Matt Pharr
d0e583b29c Release notes and doxygen version number bump for 1.2.1 2012-04-06 16:02:19 -07:00
Matt Pharr
c8feee238b Bump release number to 1.2.1 2012-04-06 15:30:54 -07:00
Matt Pharr
6712ecd928 Merge pull request #233 from nipunn1313/master
Ability to point build to custom version of llvm and clang
2012-04-06 15:24:12 -07:00
Nipunn Koorapati
d0c7b5d35c Merge remote-tracking branch 'upstream/master' 2012-04-06 17:58:21 -04:00
Nipunn Koorapati
802add1f97 Added to the Makefile the ability to point to a
custom installation of llvm and clang.
2012-04-06 17:54:55 -04:00
Matt Pharr
95556811fa Fix linux build 2012-04-05 20:39:39 -07:00
Matt Pharr
581472564d Print "friendly" ispc message when abort/seg fault signal is thrown.
Make crashes that happen in LLVM less inscrutable.

Issue #222.
2012-04-05 15:51:44 -07:00
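A minimal sketch of the approach, assuming handlers for SIGSEGV/SIGABRT that print a note before exiting (illustrative, not the actual ispc code):

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static void lSignalHandler(int sig) {
        // fprintf() isn't strictly async-signal-safe, but for a crash path
        // this is the usual pragmatic choice.
        fprintf(stderr, "ispc: internal compiler error (signal %d). "
                        "Please file a bug with the input that triggered it.\n", sig);
        _exit(1);
    }

    static void lInstallSignalHandlers() {
        signal(SIGSEGV, lSignalHandler);
        signal(SIGABRT, lSignalHandler);
    }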
Matt Pharr
c7dc8862a5 Add FAQs about various language details.
One of these finishes off issue #225.
2012-04-05 15:24:26 -07:00
Matt Pharr
4f8cf019ca Add pass to verify module before starting optimizations. 2012-04-05 08:49:39 -07:00
Matt Pharr
4c9ac7fcf1 Fix build with LLVM 2.9. 2012-04-05 08:22:40 -07:00
Matt Pharr
1dac05960a Fix build with LLVM 3.1 ToT 2012-04-05 08:17:56 -07:00
Matt Pharr
c27418da77 Add checks about references to non-lvalues.
Both ReturnStmt and DeclStmt now check the values being associated
with references to make sure that they are legal (e.g. it's illegal
to assign a varying lvalue, or a compile-time constant to a reference
type).  Previously we didn't catch this and would end up hitting
assertions in LLVM when code did this stuff.

Mostly fixes issue #225 (except for adding a FAQ about what this
error message means.)
2012-04-04 05:56:22 -07:00
Matt Pharr
637d076e99 Remove half/float conversion functions from AVX2 output.
(We were leaving around unused/unnecessary __half_to_float_uniform 
and the like, which in turn called out to the corresponding instruction.)
2012-04-03 12:18:38 -07:00
Matt Pharr
391678a5b3 Update function overload resolution logic.
Closer compatibility with C++: given a non-reference type, treat matching
to a non-const reference of that type as a better match than a const
reference of that type (rather than both being equal cost).

Issue #224.
2012-04-03 10:40:41 -07:00
Matt Pharr
4cd0cf1650 Revamp handling of function types, conversion to function ptr types.
Implicit conversion to function types is now a more standard part of
the type conversion infrastructure, rather than special cases of things
like FunctionSymbolExpr immediately returning a pointer type, etc.

Improved AddressOfExpr::TypeCheck() to actually issue errors in cases
where it's illegal to take the address of an expression.

Added AddressOfExpr::GetConstant() implementation that handles taking
the address of functions.

Issue #223.
2012-04-03 10:09:07 -07:00
Matt Pharr
b813452d33 Don't issue a slew of warnings if a bogus cpu type is specified.
Issue #221.
2012-04-03 06:13:28 -07:00
Matt Pharr
eb85da81e1 Further improvements to error reporting with function types.
Issue #219.
2012-04-03 05:55:50 -07:00
Matt Pharr
920cf63201 Improve error message about incompatible function types.
When reporting that a function has illegally been overloaded only
by return type, include "task", "export", and "extern "C"", as appropriate
in the error message to make clear what the issue is.

Finishes issue #216.
2012-04-03 05:43:23 -07:00
Matt Pharr
dc09d46bf4 Don't emit type declarations for extern'ed globals in generated header files.
This actually wasn't a good idea, since we'd like ispc programs to be able to
have varying globals that they use internally among ispc code, without having
errors about varying globals when generating headers.

Issue #214.
2012-04-03 05:36:21 -07:00
Matt Pharr
05d1b06eeb Fixes to get the C++ backend more working again. 2012-03-30 16:56:30 -07:00
Matt Pharr
c1661eb06b Allow calling GetAs{Non}ConstType() for FunctionTypes.
It's just a no-op, though, rather than an assertion failure as before.
2012-03-30 16:56:30 -07:00
Jean-Luc Duprat
e9626a1d10 Added macro PRId64 to opt.cpp for compilation on Windows 2012-03-30 16:56:30 -07:00
Matt Pharr
560bf5ca09 Updated logic for selecting target ISA when not specified.
Now, if the user specified a CPU then we base the ISA choice on that--only
if no CPU and no target is specified do we use the CPUID-based check to
pick a vector ISA.

Improvement to the fix for #205.
2012-03-30 16:36:12 -07:00
Jean-Luc Duprat
512f8d8b60 Fixed binary AND to logical AND 2012-03-29 17:03:22 -07:00
Matt Pharr
87c8a89349 Make 'export' a type qualifier, not a storage class.
In particular, this makes it legal to do "extern export foo()", among
other things.

Partially addresses issue #216.
2012-03-29 13:16:55 -07:00
Matt Pharr
255791f18e Fix to get correct variable names for extern globals that are later defined. 2012-03-29 11:50:15 -07:00
Matt Pharr
d5e3416e8e Fix bug in default argument handling introduced in 540fc6c2f3 2012-03-28 14:29:58 -07:00
Matt Pharr
5b2d43f665 Fix global variable code to correctly handle extern declarations.
When we have an "extern" global, now we no longer inadvertently define
storage for it.  Further, we now successfully do define storage when we
encounter a definition following one or more extern declarations.

Issues #215 and #217.
2012-03-28 14:15:49 -07:00
Matt Pharr
540fc6c2f3 Fix bugs with default parameter values for pointer-typed function parameters.
In particular "void foo(int * ptr = NULL)" and the like work now.

Issue #197.
2012-03-28 11:51:56 -07:00
Matt Pharr
b3c5043dcc Don't enable llvm's UnsafeFPMath option when --opt=fast-math is supplied.
This was causing functions like round() to fail on SSE2, since it has code
that does:

    x += 0x1.0p23f;
    x -= 0x1.0p23f;

which was in turn being undesirably optimized away.

Fixes issue #211.
2012-03-28 10:26:39 -07:00
Matt Pharr
d0d9aae968 Fix parser so that spaces aren't needed around "..." in foreach statements.
Issue #207.
2012-03-28 10:10:51 -07:00
Matt Pharr
3270e2bf5a Call CPUID to more reliably detect level of SSE/AVX that the host supports.
Fixes, I hope, issue #205.
2012-03-28 09:20:06 -07:00
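Roughly what a CPUID-based check looks like using GCC/Clang's <cpuid.h>; the feature-bit positions are from Intel's documentation, everything else here is an illustrative sketch:

    #include <cpuid.h>

    enum ISA { ISA_SSE2, ISA_SSE4, ISA_AVX };

    static ISA DetectBestISA() {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return ISA_SSE2;
        // (A complete AVX check also verifies OS support via OSXSAVE/XGETBV.)
        if (ecx & (1u << 28))        // ECX bit 28: AVX
            return ISA_AVX;
        if (ecx & (1u << 19))        // ECX bit 19: SSE4.1
            return ISA_SSE4;
        return ISA_SSE2;             // SSE2 (EDX bit 26) is a given on x86-64
    }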
Matt Pharr
013a3e7567 Support concatenation of adjacent string literals in the parser.
Fixes issue #208.
2012-03-28 08:52:09 -07:00
Matt Pharr
8368ba8539 Add missing checks for NULL current basic block in stmt code.
Fixes crashes if, for example, these statement types appeared after early
returns in the middle of functions.
2012-03-28 08:48:33 -07:00
Matt Pharr
ca0310e335 Merge pull request #213 from nipunn1313/master
Fixed compiler warning in expression type caster
2012-03-28 06:41:00 -07:00
Nipunn Koorapati
4690a678c1 Added parentheses around a || b && c statement in TypeCastExpr
to placate the compiler warning and make the code easier to understand.
2012-03-28 02:44:28 -04:00
Matt Pharr
f8a39402a2 Implement new, simpler function overload resolution algorithm.
We now give each conversion a cost and then find the minimum sum
of costs for all of the possible overloads.

Fixes issue #194.
2012-03-27 13:25:11 -07:00
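A schematic of cost-based overload selection; the Type forward declaration, ConversionCost(), and the cost values are all hypothetical:

    #include <cstddef>
    #include <vector>

    struct Type;  // stand-in for ispc's Type class

    static const int kNoMatch = 1 << 30;

    // Cost of converting an argument type to a parameter type: 0 for an exact
    // match, small positive values for progressively worse conversions,
    // kNoMatch if no legal conversion exists.  (Declared only; hypothetical.)
    int ConversionCost(const Type *from, const Type *to);

    // Pick the candidate whose summed per-argument conversion cost is smallest.
    static int ResolveOverload(const std::vector<std::vector<const Type *> > &candidates,
                               const std::vector<const Type *> &argTypes) {
        int best = -1, bestCost = kNoMatch;
        for (size_t c = 0; c < candidates.size(); ++c) {
            if (candidates[c].size() != argTypes.size())
                continue;
            int cost = 0;
            for (size_t a = 0; a < argTypes.size() && cost < kNoMatch; ++a) {
                int one = ConversionCost(argTypes[a], candidates[c][a]);
                cost = (one >= kNoMatch) ? kNoMatch : cost + one;
            }
            if (cost < bestCost) {
                bestCost = cost;
                best = (int)c;
            }
        }
        return best;  // -1 if nothing matched; a tie would be an ambiguity error
    }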
Jean-Luc Duprat
b923e4daea Added macro PRId64 to opt.cpp for compilation on Windows 2012-03-27 12:46:59 -07:00
Matt Pharr
247775d1ec Fix type conversion to allow array -> void * conversions.
Fixes issue #193.
2012-03-27 10:07:54 -07:00
Matt Pharr
6e9fea377d Type convert NULL to other pointer types for function call arguments.
Fixes issue #198.
2012-03-27 09:50:21 -07:00
Matt Pharr
ca5c65d032 Fix bugs where typecasting an expression to void would cause it to disappear.
This was obviously problematic in cases where the expression was a function
call or the like, with side effects.

Fixes issue #199.
2012-03-27 09:33:43 -07:00
Matt Pharr
f9dc621ebe Fix bug when doing pointer math with varying integer offsets.
We were incorrectly trying to type convert the varying offset to a
uniform value, which in turn led to an incorrect compile-time error.

Fixes issue #201.
2012-03-27 09:17:40 -07:00
Matt Pharr
ffe484c31e Implement simpler approach for header file struct emission.
Rather than explicitly building a DAG and doing a topological sort,
just traverse structs recursively and emit declarations for all of
their dependent structs before emitting the original struct declaration.

Not only is this simpler than the previous implementation, but it
fixes a bug where we'd hit an assert if we had a struct with multiple
contained members of another struct type.
2012-03-27 09:06:10 -07:00
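A sketch of the recursive emission scheme; the StructType accessors declared below are invented for illustration:

    #include <cstdio>
    #include <set>
    #include <string>

    struct StructType;  // stand-in for the real class in type.h

    // Hypothetical accessors, declared only for this sketch.
    std::string GetStructName(const StructType *st);
    int GetMemberCount(const StructType *st);
    // Returns NULL if member i isn't a struct (or an array of structs).
    const StructType *GetMemberStructType(const StructType *st, int i);
    void PrintStructDeclaration(const StructType *st, FILE *out);

    // Emit declarations for everything 'st' depends on, then 'st' itself.
    static void EmitStructDecl(const StructType *st, std::set<std::string> *emitted,
                               FILE *out) {
        if (emitted->count(GetStructName(st)))
            return;                          // already emitted (also breaks cycles)
        emitted->insert(GetStructName(st));

        // Recurse into members first so that any struct used by a member
        // (including structs inside member arrays) is declared before us.
        for (int i = 0; i < GetMemberCount(st); ++i)
            if (const StructType *member = GetMemberStructType(st, i))
                EmitStructDecl(member, emitted, out);

        PrintStructDeclaration(st, out);
    }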
Matt Pharr
62cd3418ca Add test for the bug in issue #204. 2012-03-27 09:04:45 -07:00
Matt Pharr
d8a8f3a996 For symbols that are references, return uniform ptr type as lvalue type.
Fixes issue #204.
2012-03-27 08:52:14 -07:00
Matt Pharr
0ad8dbbfc9 Fix documentation bug: atan2 arguments were reversed.
Issue #203.
2012-03-27 08:03:02 -07:00
Matt Pharr
e15a1946c6 Documentation: add ISPC_TARGET_AVX2 as a possible target #define 2012-03-27 08:02:39 -07:00
Matt Pharr
8878826661 Add non-short-circuiting and(), or(), select() to stdlib. 2012-03-26 09:37:59 -07:00
Matt Pharr
95a8b6e5e8 Fix & vs. && in logical test.
Issue #196.
2012-03-25 17:38:34 -07:00
Matt Pharr
388d0d2cfd Add #include <string.h>
Fixes build on Linux and Windows.  (Strangely, this didn't break the
OSX build.)

Issue #195.
2012-03-25 17:38:15 -07:00
Matt Pharr
d3a374e71c Fix malformed program crasher. 2012-03-25 13:10:23 -07:00
Matt Pharr
1da2834b1e Allow the last member of a struct to be an unsized/zero-length array.
This enables the C trick of allocating a dynamic amount of storage for
the struct in order to extend out the array to the desired length.
2012-03-25 13:10:12 -07:00
Matt Pharr
ca3100874f Add FAQ about why varying values can't be passed to exported functions. 2012-03-25 11:35:28 -07:00
Matt Pharr
117f48a331 Don't include foreach stmts in cost estimates from EstimateCost().
Because they reestablish an 'all on' mask inside their body, it doesn't
make sense to include their cost when evaluating whether it's worth
re-establishing an 'all on' mask dynamically.  (This does mean that
EstimateCost()'s return value isn't the most obvious thing, but currently
in all the cases where we need it, this is the more appropriate value to
return.)
2012-03-25 10:32:44 -07:00
Matt Pharr
89bbceefee Make sure that foreach() statements never execute with an "all off" mask. 2012-03-25 10:07:12 -07:00
Matt Pharr
7e18f0e247 Small improvement to float->half function in stdlib.
Rewrite things to be able to do a float MINPS, for slightly
better code on SSE2 (which has that but not a signed int
min).  The SSE2 code is now 23 instructions (vs. 21 intrinsics).
2012-03-23 16:09:32 -07:00
Jean-Luc Duprat
29c2f24faf Merge branch 'master' of git://github.com/ispc/ispc 2012-03-22 16:33:05 -07:00
Matt Pharr
3bb2dee275 Update float_to_half() with more efficient version from @rygorous 2012-03-22 13:36:26 -07:00
Matt Pharr
88cd5584e8 Add Debug() statement to report on if stmt cost/safety test results. 2012-03-22 13:36:26 -07:00
Jean-Luc Duprat
41f9ce2560 Merge branch 'master' of git://github.com/ispc/ispc 2012-03-22 10:02:05 -07:00
Matt Pharr
20044f5749 Distinguish between dereferencing pointers and references.
We now have separate Expr implementations for dereferencing pointers
and automatically dereferencing references.  This is in particular
necessary so that we can detect attempts to dereference references
with the '*' operator in programs and issue an error in that case.

Fixes issue #192.
2012-03-22 06:48:02 -07:00
Jean-Luc Duprat
833f0a6aa7 Merge branch 'master' of git://github.com/ispc/ispc 2012-03-21 17:07:18 -07:00
Matt Pharr
10c5ba140c Much more efficient half_to_float() code, via @rygorous.
Also, switch deferred shading example to use it. (Rather than
the "fast" half to float that doesn't handle deforms, etc.)
2012-03-21 16:13:04 -07:00
Matt Pharr
316de0b880 Make various Expr::EstimateCost() implementations return 0 if operand(s) are constants.
(Assume that constant folding will make these be free.)
2012-03-21 16:12:35 -07:00
Matt Pharr
989966f81b Annotate std lib functions with __declspec safe, cost, as appropriate. 2012-03-21 16:12:32 -07:00
Matt Pharr
ccd550dc52 __declspec support for function declarations.
safe: indicates that the function can safely be called with an "all off"
execution mask.

costN: (N an integer) overrides the cost estimate for the function with
the given value.
2012-03-21 16:11:50 -07:00
Matt Pharr
ddf350839a Add ability to parse __declspec lists to parser. 2012-03-21 16:11:50 -07:00
Matt Pharr
6a7dd2787a Fix bug in check for varying parameters in exported functions.
In particular, we weren't checking to see if the pointed-to type of
pointer parameters was varying.

Fixes issue #191.
2012-03-21 10:06:53 -07:00
Jean-Luc Duprat
385771e73e Merge branch 'master' of git://github.com/ispc/ispc 2012-03-20 13:31:31 -07:00
Matt Pharr
349ab0b9c5 Bump version number to 1.2.1dev 2012-03-20 12:46:23 -07:00
Matt Pharr
b5e6c6a2f3 update news to include paper 2012-03-20 12:05:23 -07:00
Matt Pharr
2832ea641f Release notes, bump doxygen version for 1.2.0 release 2012-03-20 11:58:39 -07:00
Matt Pharr
cb7edf2725 Set version to 1.2.0 for release builds 2012-03-20 11:13:50 -07:00
Matt Pharr
f1f1be2822 Remove twine op that caused crash on Windows, fix warning 2012-03-20 11:13:02 -07:00
Matt Pharr
7dffd65609 Add __foreach_active statement to loop over active prog. instances.
For now this has the __ prefix, as an experimental feature currently only
used in the standard library implementation.  It's probably worth making
something along these lines an official feature, but I'm not sure if this
in its current form is quite the right thing.
2012-03-20 08:46:00 -07:00
Matt Pharr
2c8a44e28b Merge pull request #189 from guanqun/fix-extern-c-error
calls to C/C++ functions should not be mangled.
2012-03-20 05:55:09 -07:00
Matt Pharr
39bb95a6ee Merge pull request #190 from guanqun/fix-output-option
fix --outfile option error
2012-03-20 05:54:29 -07:00
Lu Guanqun
da9dba80a0 fix --outfile option error 2012-03-20 09:44:49 +08:00
Lu Guanqun
12f3285f9b calls to C/C++ functions should not be mangled.
Otherwise, linker will never find the correct function.
2012-03-20 09:27:57 +08:00
Matt Pharr
7e954e4248 Don't issue gather/scatter warnings in the 'extra' bits of foreach loops.
With AOS data, we can often coalesce the accesses into gathers for the main
part of foreach loops but only fail on the last bits where the mask is not
all on (since the coalescing code doesn't handle mixed masks, yet.) Before,
we'd report success with coalescing and then also report that gathers were needed
for the same accesses that were coalesced, which was a) confusing, and b)
didn't accurately represent what was going on for the majority of the loop
iterations.
2012-03-19 15:08:35 -07:00
Matt Pharr
d74cc6397b Fix significant bug in mask management in code generated for 'foreach'.
In particular, we 1. weren't setting the function mask to 'all on', such that
any mixed function mask would in turn apply inside the foreach loop, and 2.
weren't always setting the internal mask to 'all on' before doing any additional
masking based on the iteration variables.
2012-03-19 15:06:35 -07:00
Matt Pharr
777343331e Print numeric version number with --version. 2012-03-19 14:41:25 -07:00
Matt Pharr
a062653743 Add patterns to better-match code generated when accessing SOA data.
In particular, LLVMVectorIsLinear() and LLVMVectorValuesAllEqual() are able
to reason a bit about the effects of the shifts and the ANDs that are
generated from SOA indexing calculations, so that they can detect more cases
where a linear sequence of locations are in fact being accessed in
the presence of SOA data.
2012-03-19 12:04:39 -07:00
Matt Pharr
57af0eb64f Still do the gather/scatter -> load store pass even if leaving 'pseudo' mem opts unchanged. 2012-03-19 12:04:38 -07:00
Matt Pharr
60aae16752 Move check for linear vector to LLVMVectorIsLinear() function. 2012-03-19 11:57:04 -07:00
Matt Pharr
e264d95019 LLVMVectorValuesAllEqual() improvements.
Clean up the API, so the caller doesn't have to pass in a vector so
the function can track PHI nodes (do that internally instead.)

Handle casts in lValuesAreEqual().
2012-03-19 11:54:18 -07:00
Matt Pharr
0664f5a724 Add LLVMExtractVectorInts() function, use it in the opt code. 2012-03-19 11:48:38 -07:00
Matt Pharr
17c6a19527 Add LLVMExtractFirstVectorElement() function (and use it).
For cases where it turns out that we just need the first element of
a vector (e.g. because we've determined that all of the values are
equal), it's often more efficient to only compute that one value
with scalar operations than to compute the whole vector's worth and
then just use one value.  This function tries to rewrite a vector
computation to the scalar equivalent, if possible.

(Partial work-around to http://llvm.org/bugs/show_bug.cgi?id=11775.)

Note that sometimes this is the wrong thing to do--if we need the entire
vector value for other purposes, for example.
2012-03-19 11:48:33 -07:00
Matt Pharr
cbc8b8259b Use LLVMIntAsType() in opt code instead of locally-defined equivalent. 2012-03-19 11:36:00 -07:00
Matt Pharr
1067a2e4be Add LLVMShuffleVectors() and LLVMConcatVectors() functions.
These were local functions in opt.cpp that are now public via the
llvmutil.* files.
2012-03-19 11:34:52 -07:00
Matt Pharr
74a031a759 Small improvements to debug info printing in opt.cpp 2012-03-19 11:32:08 -07:00
Matt Pharr
ee437193fb Add LLVMDumpValue() utility routine 2012-03-19 11:31:27 -07:00
Matt Pharr
436c53037e Fix assertion in FunctionEmitContext::storeUniformToSOA() 2012-03-19 11:29:14 -07:00
Matt Pharr
f55ba9d3cb Remove (highly verbose) Debug() call for type conversions. 2012-03-19 11:28:55 -07:00
Matt Pharr
8adb99b768 Improve source locations reported with warnings. 2012-03-19 11:28:34 -07:00
Matt Pharr
13c42412d2 Issue perf. warning if SOA width narrower than gang size is used. 2012-03-19 11:28:16 -07:00
Matt Pharr
75507d8b35 Remove error message if old 'reference' keyword is used. 2012-03-19 11:27:53 -07:00
Matt Pharr
ddfe4932ac Fix parsing of 'launch' so that angle brackets can be removed.
Issue #6.
2012-03-19 11:27:32 -07:00
Jean-Luc Duprat
cf208cc2e3 Merge branch 'master' of git://github.com/ispc/ispc 2012-03-17 12:59:43 -07:00
Matt Pharr
28ac016928 Fix bugs in checks for varying parameters in exported functions.
In short, we inadvertently weren't checking whether pointers themselves
were varying, which in turn led to an assertion later if an exported
function did have a varying parameter.

Issue #187.
2012-03-15 07:20:36 -05:00
Jean-Luc Duprat
f4ae41d006 Merge branch 'master' of git://github.com/ispc/ispc 2012-03-14 09:58:52 -07:00
Matt Pharr
9ec8e5a275 Fix compile warnings on Linux 2012-03-12 13:12:23 -07:00
Matt Pharr
a473046058 Once again fix for LLVM 3.1 TOT API changes 2012-03-11 15:04:26 -07:00
Matt Pharr
a69b7a5a01 Fix build with LLVM 3.1 TOT 2012-03-10 13:06:53 -08:00
Matt Pharr
640918bcc0 Call fclose() in deferred example. (Andy Zhang). 2012-03-07 08:50:10 -08:00
Matt Pharr
f39fbdb3fc Add various new functions to "internal" functions list.
Building with multiple compilation targets in a single binary was
broken due to multiple symbol definitions.
2012-03-05 16:41:20 -08:00
Matt Pharr
50d4d81062 Add file in docs/ for news page on website 2012-03-05 16:10:20 -08:00
Matt Pharr
3b95452481 Add memcpy(), memmove() and memset() to the standard library.
Issue #183.
2012-03-05 16:09:00 -08:00
Matt Pharr
c152ae3c32 Add single-precision asin() and acos() to stdlib.
Issue #184.
2012-03-05 13:32:13 -08:00
Matt Pharr
f6cbaa78e8 Update stdlib documentation to match recent pointed-to default variability changes 2012-03-05 13:32:12 -08:00
Matt Pharr
7adb250b59 Added tests and documentation for soa<> rate qualifier. 2012-03-05 09:58:10 -08:00
Matt Pharr
db5db5aefd Add native support for (AO)SOA data layout.
There's now a SOA variability class (in addition to uniform,
varying, and unbound variability); the SOA factor must be a
positive power of 2.

When applied to a type, the leaf elements of the type (i.e.
atomic types, pointer types, and enum types) are widened out
into arrays of the given SOA factor.  For example, given

struct Point { float x, y, z; };

Then "soa<8> Point" has a memory layout of "float x[8], y[8],
z[8]".

Furthermore, array indexing syntax has been augmented so that
when indexing into arrays of SOA-variability data, the two-stage
indexing (first into the array of soa<> elements and then into
the leaf arrays of SOA data) is performed automatically.
2012-03-05 09:58:10 -08:00
Matt Pharr
8fdf84de04 Disable debugging printing code. 2012-03-05 09:58:09 -08:00
Matt Pharr
ff5cbe80d1 Add more files to .gitignore 2012-03-05 09:58:09 -08:00
Matt Pharr
e013e0a374 Handle extract instructions in the lGetBasePtrAndOffsets() pattern matching code. 2012-03-05 09:58:09 -08:00
Matt Pharr
b7df312ca7 Small improvements to error location reporting, assertions in expr.cpp 2012-03-05 09:58:09 -08:00
Matt Pharr
ce82c3c0ae Return from function after storing initializer value. 2012-03-05 09:58:09 -08:00
Matt Pharr
2f958cfbda Fix cases where malformed program could cause crash. 2012-03-05 09:58:09 -08:00
Matt Pharr
8ef41dfd97 Represent variability with small helper class rather than an enum.
This provides part of the basis for representing SOA width in terms
of variability, but there should be no functional changes in this
checkin.
2012-03-05 09:58:09 -08:00
Matt Pharr
3082ea4765 Require Type::Equal() for all type equality comparisons.
Previously, we uniqued AtomicTypes, so that they could be compared
by pointer equality, but with forthcoming SOA variability changes,
this would become too unwieldy (lacking a more general / ubiquitous
type uniquing implementation.)
2012-03-05 09:58:09 -08:00
Matt Pharr
e482d29951 Add LLVM{U}IntAsType() utility routine 2012-03-05 09:58:09 -08:00
Matt Pharr
ff48dd7bfb Remove unused SOAArrayType class and Type::GetSOAType() methods. 2012-03-05 09:58:09 -08:00
Matt Pharr
7bf9c11822 Add uniform variants of RNG functions to stdlib 2012-03-05 09:56:30 -08:00
Matt Pharr
f7937f1e4b Fix build with LLVM2.9/3.0 2012-03-03 10:30:56 -08:00
Matt Pharr
0115eeabfe Update deferred example to take advantage of new pointer variability rules. 2012-02-29 14:27:53 -08:00
Matt Pharr
4b9c3ec0da Fix bug in StructType::GetElementType().
We were only resolving unbound variability for the top-level type,
which isn't enough if we have e.g. an unbound-variability pointer
pointing to some type with unbound variability.
2012-02-29 14:27:53 -08:00
Matt Pharr
55b81e35a7 Modify rules for default variability of pointed-to types.
Now, the pointed-to type is always uniform by default (if an explicit
rate qualifier isn't provided).  This rule is easier to remember and
seems to work well in more cases than the previous rule from 6d7ff7eba2.
2012-02-29 14:27:53 -08:00
Matt Pharr
2a1c7f2d47 Fix bug with indexing into varying pointer w/uniform index.
Issue #182.
2012-02-25 10:19:21 -08:00
Matt Pharr
8603f9838f Issue an error if "uniform" or "varying" qualifiers are applied to void types.
Issue #179.
2012-02-21 12:26:42 -08:00
Matt Pharr
95224f3f11 Improve detection of cases where 32-bit gather/scatter can be used.
Previously, we weren't noticing that an <n x i64> zero vector could
be represented as an <n x i32> without error.
2012-02-21 12:13:25 -08:00
Matt Pharr
f81acbfe80 Implement unbound variability for struct types.
Now, if a struct member has an explicit 'uniform' or 'varying'
qualifier, then that member has that variability, regardless of
the struct's own variability.  Members without
'uniform' or 'varying' have unbound variability, and in turn
inherit the variability of the struct.

As a result of this, now structs can properly be 'varying' by default,
just like all the other types, while still having sensible semantics.
2012-02-21 10:28:31 -08:00
Matt Pharr
6d7ff7eba2 Update defaults for variability of pointed-to types.
Now, if rate qualifiers aren't used to specify otherwise, varying
pointers point to uniform types by default.  As before, uniform
pointers point to varying types by default.

   float *foo;  // varying pointer to uniform float
   float * uniform foo;  // uniform pointer to varying float

These defaults seem to require the least amount of explicit
uniform/varying qualifiers for most common cases, though TBD if it
would be easier to have a single rule that e.g. the pointed-to type
is always uniform by default.
2012-02-21 06:27:34 -08:00
Matt Pharr
ad429db7e8 Generate more efficient code for variable initializers.
If the initializer is a compile-time constant (or at least a part of it
is), then store the constant value in a module-local constant global
value and then memcpy the value into the variable.  This, in turn,
turns into much better assembly in the end.

Issue #176.
2012-02-14 13:51:23 -08:00
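Schematically, in terms of the LLVM C++ API (names are illustrative, and the exact CreateMemCpy signature changed across LLVM versions):

    // Put the compile-time-constant part of the initializer into a private,
    // module-level constant global...
    llvm::GlobalVariable *initGlobal =
        new llvm::GlobalVariable(*module, constValue->getType(),
                                 /* isConstant = */ true,
                                 llvm::GlobalValue::PrivateLinkage,
                                 constValue, "init_const");

    // ...then copy it into the variable's storage with one memcpy, which the
    // backend lowers to a few wide stores instead of many scalar ones.
    builder.CreateMemCpy(varPtr, initGlobal, allocSize, /* align = */ 0);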
Matt Pharr
4c07abbaf4 Support returning NULL pointer values from ConstExpr::GetConstant() 2012-02-14 13:49:18 -08:00
Matt Pharr
e3c0551129 Handle uniform short-vector types in ExprList::GetConstant() 2012-02-14 13:48:43 -08:00
Matt Pharr
8971baa42b Fix silly bug in ConstExpr::GetConstant() with enum types.
(They would be incorrectly matched as int8 types.)
2012-02-14 13:48:10 -08:00
Matt Pharr
317a1f51f7 Allow fewer initializer values in initializer expr lists than expected.
We now match C's behavior, where if we have an initializer list with
too-few values for the underlying type, any additional elements are
initialized to zero.

Fixes issue #123.
2012-02-14 13:47:11 -08:00
Matt Pharr
c63d139482 Add FunctionEmitContext::MemcpyInst() 2012-02-14 13:43:59 -08:00
Matt Pharr
9e682362e9 Fix bug in ArrayType::SizeUnsizedArrays().
If given an initializer list with too many elements for the actual array
size, in some cases we would incorrectly resize the explicitly sized array
to be the size implied by the initializer list.
2012-02-14 13:43:38 -08:00
Matt Pharr
56ec939692 Add perfbench to examples.sln for Windows 2012-02-14 10:07:08 -08:00
Matt Pharr
a86b942730 Fix cases in coalesce opt where offsets would be truncated to 32 bits 2012-02-14 10:05:07 -08:00
Matt Pharr
52eb4c6014 Fix warnings with Windows build 2012-02-14 10:01:45 -08:00
Matt Pharr
f4adbbf90c Merge a number of cbackend changes from the LLVM dev tree.
This fixes a number of failing tests with LLVM 3.1svn when
using the generic targets.

Issue #175.
2012-02-13 16:52:38 -08:00
Matt Pharr
cc86e4a7d2 Disable coalescing optimizations when using generic target.
The main issue is that they end up generating a number of smaller
vector ops (e.g. 4-wide and 8-wide on the 16-wide generic target,
which the examples/intrinsics implementations don't currently
support.)

This fixes a number of failing tests for now; it may be worth
generalizing the stuff in examples/intrinsics at some point,
since as a general principle, e.g. if generating LLVM IR output,
the coalescing optimizations are still desirable.

Issue #175.
2012-02-13 16:52:01 -08:00
Matt Pharr
e864447e4a Fix silly bug in vector scale extraction optimization.
(Introduced in f20a2d2ee.  How did this ever pass tests?)
2012-02-13 12:06:45 -08:00
Matt Pharr
73bf552cd6 Add support for coalescing memory accesses from gathers.
There are two related optimizations that happen now.  (These
currently only apply for gathers where the mask is known to be
all on, and to gathers that are accessing 32-bit sized elements,
but both of these may be generalized in the future.)

First, for any single gather, we are now more flexible in mapping it
to individual memory operations.  Previously, we would only either map
it to a general gather (one scalar load per SIMD lane), or an 
unaligned vector load (if the program instances could be determined
to be accessing a sequential set of locations in memory.)

Now, we are able to break gathers into scalar, 2-wide (i.e. 64-bit),
4-wide, or 8-wide loads.  Further, we now generate code that shuffles
these loads around.  Doing fewer, larger loads in this manner, when
possible, can be more efficient.

Second, we can coalesce memory accesses across multiple gathers. If 
we have a series of gathers without any memory writes in the middle,
then we try to analyze their reads collectively and choose an efficient
set of loads for them.  Not only does this help if different gathers
reuse values from the same location in memory, but it's specifically
helpful when data with AOS layout is being accessed; in this case,
we're often able to generate wide vector loads and appropriate shuffles
automatically.
2012-02-10 13:10:39 -08:00
Matt Pharr
f20a2d2ee9 Generalize code to extract scales by 2/4/8 from addressing calculations.
Now, if we have a scale by 16, say, we extract out the scalar scale
of 8 and leave an explicit scale by 2.
2012-02-10 12:35:44 -08:00
Matt Pharr
0c25bc063c Add lGEPInst() utility routine to opt.cpp.
Deal with the messiness of LLVM API changes when creating
these in a single place.
2012-02-10 12:32:15 -08:00
Matt Pharr
db72781d2a Fix C++ backend to not assert with LLVM 3.1 svn builds. 2012-02-10 12:30:31 -08:00
Matt Pharr
0c8ad09040 Fix placement of ParserInit() call
This makes it possible to use fuzz testing even without --nostdlib!
2012-02-10 12:29:57 -08:00
Matt Pharr
49880ab761 Constant fold more cases in SelectExpr::Optimize()
Specifically, if both of the expressions are compile-time constants
and the condition is a varying compile-time constant (even if not 
all true or all false), then we can assemble a compile-time constant
result.
2012-02-10 12:28:54 -08:00
Matt Pharr
fe2d9aa600 Add perfbench to examples: a few small microbenchmarks. 2012-02-10 12:27:13 -08:00
Matt Pharr
1dead425e4 Don't indent *too* much on continued lines with warnings/errors. 2012-02-10 12:26:35 -08:00
Matt Pharr
adb1e47a59 Add FAQ about how to cross-inline ispc and C/C++ code. 2012-02-10 12:26:19 -08:00
Matt Pharr
ffba8580c1 Make sure that non-zero exit code is returned when input file not found.
Fixes issue #174.
2012-02-08 19:53:05 -08:00
Alex Reece
ea18427d29 Remove UnwindInst
Code no longer builds against head of LLVM branch after revision 149906
removed the unwind instruction.
2012-02-07 15:46:22 -08:00
Jean-Luc Duprat
97d42f5c53 Merge remote-tracking branch 'matt/master' 2012-02-07 12:50:31 -08:00
Matt Pharr
f3089df086 Improve error handling and reporting in the parser.
Add a number of additional error cases in the grammar.

Enable bison's extended error reporting, to get better messages about the
context of errors and the expected (but not found) tokens at errors.

Improve the printing of these by providing an implementation of yytnamerr
that rewrites things like "TOKEN_MUL_ASSIGN" to "*=" in error messages.

Print the source location (using Error()) when yyerror() is called; wiring
this up seems to require no longer building a 'pure parser' but having
yylloc as a global, which in turn led to having to update all of the uses of
it (which previously accessed it as a pointer).

Updated a number of tests_errors for resulting changes in error text.
2012-02-07 11:13:32 -08:00
Matt Pharr
157e7c97ae Fix a variety of cases in the parser that could crash with malformed programs. 2012-02-07 11:08:00 -08:00
Matt Pharr
bb8e13e3c9 Add support for -I command-line argument to specify #include search directories. 2012-02-07 08:39:01 -08:00
Matt Pharr
5b4673e8eb Fix build with LLVM 2.9. 2012-02-07 08:37:13 -08:00
Matt Pharr
5b9de8cc07 Fix test to account for updated error message. 2012-02-07 08:36:56 -08:00
Matt Pharr
33ea934c8f Fix over-aggressive check in DereferenceExpr::TypeCheck()
(Reference types are allowed as well.)
2012-02-07 08:18:33 -08:00
Matt Pharr
6b3e14b0a4 Add command-line option to enable debugging output from parser. 2012-02-06 15:35:43 -08:00
Matt Pharr
098ceb5567 Issue error on attempted type convert from/to function type. 2012-02-06 15:35:43 -08:00
Matt Pharr
8e2b0632e8 Issue an error if an array of references is declared.
(More malformed program fixes.)
2012-02-06 15:35:43 -08:00
Matt Pharr
420d373d89 Move assert so that an error is issued for "break" outside of loops. 2012-02-06 15:35:43 -08:00
Matt Pharr
a59fd7eeb3 Fix a missing return value in the parser. 2012-02-06 15:35:43 -08:00
Matt Pharr
ee91fa1228 Make sure the program doesn't have a dereference of a non-pointer type. 2012-02-06 15:35:43 -08:00
Matt Pharr
a2b5ce0172 Add --help-dev option, only print developer options when it is used. 2012-02-06 15:35:43 -08:00
Matt Pharr
3efbc71a01 Add fuzz testing of input programs.
When the --fuzz-test command-line option is given, the input program
will be randomly perturbed by the lexer in an effort to trigger
assertions or crashes in the compiler (neither of which should ever
happen, even for malformed programs.)
2012-02-06 15:34:47 -08:00
Matt Pharr
b7c5af7e64 Prohibit returning functions from functions.
(Fix malformed program crasher)
2012-02-06 14:46:03 -08:00
Matt Pharr
f939015b97 Default to int32 for declarations without specified types.
(e.g. "uniform foo" == "uniform int32 foo")
2012-02-06 14:46:03 -08:00
Matt Pharr
a9ed71f553 Bug fixes to avoid NULL pointer derefs with malformed programs. 2012-02-06 14:45:58 -08:00
Matt Pharr
96a429694f 80 column fixes 2012-02-06 14:44:55 -08:00
Matt Pharr
fddc5e022e Fix typo in IfStmt::EstimateCost() 2012-02-06 14:44:54 -08:00
Matt Pharr
2236d53def Issue error if &=, |=, ^=, <<=, or >>= used with floats. 2012-02-06 14:44:54 -08:00
Matt Pharr
4e018d0a20 Improve tracking of source position in the presence of /* */ comments.
Don't let the preprocessor remove comments anymore, so that the rules
in lex.ll can handle them.  Fix lCComment() to update the source
position as it eats characters in comments.
2012-02-06 14:44:54 -08:00
Matt Pharr
977b983771 Issue error on "void" typed variable, function parameter, or struct member. 2012-02-06 14:44:48 -08:00
Matt Pharr
fa7a7fe23e Fix error handling in type code. 2012-02-06 12:39:14 -08:00
Matt Pharr
724a843bbd Add --quiet option to supress all diagnostic output 2012-02-06 12:39:09 -08:00
Jean-Luc Duprat
0db752f3a2 Merge branch 'master' of github.com:jduprat/ispc 2012-01-26 13:43:15 -08:00
Jean-Luc Duprat
ee8b6ebbf6 Merge remote-tracking branch 'matt/master' 2012-01-26 10:41:13 -08:00
Jean-Luc Duprat
f2b99ccb08 Made run_tests.py executable 2012-01-11 10:06:41 -08:00
546 changed files with 121755 additions and 10428 deletions

.gitignore

@@ -5,4 +5,11 @@ ispc
ispc_test
objs
docs/doxygen
docs/ispc.html
docs/*.html
tests*/*cpp
tests*/*run
examples/*/*.png
examples/*/*.ppm
examples/*/objs/*

Makefile

@@ -2,6 +2,15 @@
# ispc Makefile
#
# If you have your own special version of llvm and/or clang, change
# these variables to match.
LLVM_CONFIG=$(shell which llvm-config)
CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
# Add llvm bin to the path so any scripts run will go to the right llvm-config
LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
export PATH:=$(LLVM_BIN):$(PATH)
ARCH_OS = $(shell uname)
ifeq ($(ARCH_OS), Darwin)
ARCH_OS2 = "OSX"
@@ -10,27 +19,17 @@ else
endif
ARCH_TYPE = $(shell arch)
ifeq ($(shell llvm-config --version), 3.1svn)
LLVM_LIBS=-lLLVMAsmParser -lLLVMInstrumentation -lLLVMLinker \
-lLLVMArchive -lLLVMBitReader -lLLVMDebugInfo -lLLVMJIT -lLLVMipo \
-lLLVMBitWriter -lLLVMTableGen -lLLVMCBackendInfo \
-lLLVMX86Disassembler -lLLVMX86CodeGen -lLLVMSelectionDAG \
-lLLVMAsmPrinter -lLLVMX86AsmParser -lLLVMX86Desc -lLLVMX86Info \
-lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCDisassembler -lLLVMMCParser \
-lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMTransformUtils \
-lLLVMipa -lLLVMAnalysis -lLLVMMCJIT -lLLVMRuntimeDyld \
-lLLVMExecutionEngine -lLLVMTarget -lLLVMMC -lLLVMObject -lLLVMCore \
-lLLVMSupport
else
LLVM_LIBS=$(shell llvm-config --libs)
endif
LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs)
CLANG=clang
CLANG_LIBS = -lclangFrontend -lclangDriver \
-lclangSerialization -lclangParse -lclangSema \
-lclangAnalysis -lclangAST -lclangLex -lclangBasic
ifneq ($(shell $(LLVM_CONFIG) --version), 3.0)
CLANG_LIBS += -lclangEdit
endif
ISPC_LIBS=$(shell llvm-config --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
-lpthread
ifeq ($(ARCH_OS),Linux)
@@ -41,8 +40,8 @@ ifeq ($(ARCH_OS2),Msys)
ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
endif
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
LLVM_VERSION=LLVM_$(shell llvm-config --version | sed s/\\./_/)
LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//)
LLVM_VERSION_DEF=-D$(LLVM_VERSION)
BUILD_DATE=$(shell date +%Y%m%d)
@@ -50,15 +49,16 @@ BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
CXX=g++
CPP=cpp
OPT=-g3
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
OPT=-O2
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \
-Wall $(LLVM_VERSION_DEF) \
-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
LDFLAGS=
ifeq ($(ARCH_OS),Linux)
# try to link everything statically under Linux (including libstdc++) so
# that the binaries we generate will be portable across distributions...
LDFLAGS=-static
# LDFLAGS=-static
endif
LEX=flex
@@ -71,8 +71,8 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
type.cpp util.cpp
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
opt.h stmt.h sym.h type.h util.h
TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
generic-16 generic-1
TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
builtins/dispatch.ll
BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
@@ -86,10 +86,10 @@ OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
default: ispc
.PHONY: dirs clean depend doxygen print_llvm_src
.PHONY: dirs clean depend doxygen print_llvm_src llvm_check
.PRECIOUS: objs/builtins-%.cpp
depend: $(CXX_SRC) $(HEADERS)
depend: llvm_check $(CXX_SRC) $(HEADERS)
@echo Updating dependencies
@gcc -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend
@@ -99,7 +99,15 @@ dirs:
@echo Creating objs/ directory
@/bin/mkdir -p objs
print_llvm_src:
llvm_check:
@llvm-config --version > /dev/null || \
(echo; \
echo "******************************************"; \
echo "ERROR: llvm-config not found in your PATH"; \
echo "******************************************"; \
echo; exit 1)
print_llvm_src: llvm_check
@echo Using LLVM `llvm-config --version` from `llvm-config --libdir`
clean:
@@ -111,7 +119,7 @@ doxygen:
ispc: print_llvm_src dirs $(OBJS)
@echo Creating ispc executable
@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
@$(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
objs/%.o: %.cpp
@echo Compiling $<
@@ -121,6 +129,10 @@ objs/cbackend.o: cbackend.cpp
@echo Compiling $<
@$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $<
objs/opt.o: opt.cpp
@echo Compiling $<
@$(CXX) -fno-rtti $(CXXFLAGS) -o $@ -c $<
objs/%.o: objs/%.cpp
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<

ast.cpp

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -32,8 +32,10 @@
*/
/** @file ast.cpp
@brief
*/
@brief General functionality related to abstract syntax trees and
traversal of them.
*/
#include "ast.h"
#include "expr.h"
@@ -53,10 +55,10 @@ ASTNode::~ASTNode() {
// AST
void
AST::AddFunction(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code) {
AST::AddFunction(Symbol *sym, Stmt *code) {
if (sym == NULL)
return;
functions.push_back(new Function(sym, args, code));
functions.push_back(new Function(sym, code));
}
@@ -90,6 +92,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
DoStmt *dos;
ForStmt *fs;
ForeachStmt *fes;
ForeachActiveStmt *fas;
ForeachUniqueStmt *fus;
CaseStmt *cs;
DefaultStmt *defs;
SwitchStmt *ss;
@@ -99,6 +103,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
PrintStmt *ps;
AssertStmt *as;
DeleteStmt *dels;
UnmaskedStmt *ums;
if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
@@ -135,6 +140,13 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
postFunc, data);
fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
}
else if ((fas = dynamic_cast<ForeachActiveStmt *>(node)) != NULL) {
fas->stmts = (Stmt *)WalkAST(fas->stmts, preFunc, postFunc, data);
}
else if ((fus = dynamic_cast<ForeachUniqueStmt *>(node)) != NULL) {
fus->expr = (Expr *)WalkAST(fus->expr, preFunc, postFunc, data);
fus->stmts = (Stmt *)WalkAST(fus->stmts, preFunc, postFunc, data);
}
else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
@@ -151,7 +163,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
rs->val = (Expr *)WalkAST(rs->val, preFunc, postFunc, data);
rs->expr = (Expr *)WalkAST(rs->expr, preFunc, postFunc, data);
else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
std::vector<Stmt *> &sls = sl->stmts;
for (unsigned int i = 0; i < sls.size(); ++i)
@@ -163,6 +175,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
else if ((dels = dynamic_cast<DeleteStmt *>(node)) != NULL)
dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
else if ((ums = dynamic_cast<UnmaskedStmt *>(node)) != NULL)
ums->stmts = (Stmt *)WalkAST(ums->stmts, preFunc, postFunc, data);
else
FATAL("Unhandled statement type in WalkAST()");
}
@@ -180,7 +194,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
MemberExpr *me;
TypeCastExpr *tce;
ReferenceExpr *re;
DereferenceExpr *dre;
PtrDerefExpr *ptrderef;
RefDerefExpr *refderef;
SizeOfExpr *soe;
AddressOfExpr *aoe;
NewExpr *newe;
@@ -221,8 +236,12 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
tce->expr = (Expr *)WalkAST(tce->expr, preFunc, postFunc, data);
else if ((re = dynamic_cast<ReferenceExpr *>(node)) != NULL)
re->expr = (Expr *)WalkAST(re->expr, preFunc, postFunc, data);
else if ((dre = dynamic_cast<DereferenceExpr *>(node)) != NULL)
dre->expr = (Expr *)WalkAST(dre->expr, preFunc, postFunc, data);
else if ((ptrderef = dynamic_cast<PtrDerefExpr *>(node)) != NULL)
ptrderef->expr = (Expr *)WalkAST(ptrderef->expr, preFunc, postFunc,
data);
else if ((refderef = dynamic_cast<RefDerefExpr *>(node)) != NULL)
refderef->expr = (Expr *)WalkAST(refderef->expr, preFunc, postFunc,
data);
else if ((soe = dynamic_cast<SizeOfExpr *>(node)) != NULL)
soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
else if ((aoe = dynamic_cast<AddressOfExpr *>(node)) != NULL)
@@ -300,19 +319,39 @@ TypeCheck(Stmt *stmt) {
}
struct CostData {
CostData() { cost = foreachDepth = 0; }
int cost;
int foreachDepth;
};
static bool
lCostCallback(ASTNode *node, void *c) {
int *cost = (int *)c;
*cost += node->EstimateCost();
lCostCallbackPre(ASTNode *node, void *d) {
CostData *data = (CostData *)d;
if (dynamic_cast<ForeachStmt *>(node) != NULL)
++data->foreachDepth;
if (data->foreachDepth == 0)
data->cost += node->EstimateCost();
return true;
}
static ASTNode *
lCostCallbackPost(ASTNode *node, void *d) {
CostData *data = (CostData *)d;
if (dynamic_cast<ForeachStmt *>(node) != NULL)
--data->foreachDepth;
return node;
}
int
EstimateCost(ASTNode *root) {
int cost = 0;
WalkAST(root, lCostCallback, NULL, &cost);
return cost;
CostData data;
WalkAST(root, lCostCallbackPre, lCostCallbackPost, &data);
return data.cost;
}
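The pre/post callback pair is the standard way this file uses WalkAST(): the pre-visit callback returns a bool saying whether to keep descending, and the post-visit callback returns the (possibly replaced) node. As a hedged sketch of the same pattern, here is a hypothetical walker that just counts function-call expressions; the names suffixed with "Example" are illustrative and not part of ispc.
struct CallCountExample {
    CallCountExample() { numCalls = 0; }
    int numCalls;
};
static bool
lCountCallsPreExample(ASTNode *node, void *d) {
    // Count each FunctionCallExpr encountered; keep walking into children.
    CallCountExample *cc = (CallCountExample *)d;
    if (dynamic_cast<FunctionCallExpr *>(node) != NULL)
        ++cc->numCalls;
    return true;
}
static ASTNode *
lCountCallsPostExample(ASTNode *node, void *d) {
    // No rewriting; hand the node back unchanged.
    return node;
}
static int
CountFunctionCallsExample(ASTNode *root) {
    CallCountExample cc;
    WalkAST(root, lCountCallsPreExample, lCountCallsPostExample, &cc);
    return cc.numCalls;
}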
@@ -323,14 +362,22 @@ static bool
lCheckAllOffSafety(ASTNode *node, void *data) {
bool *okPtr = (bool *)data;
if (dynamic_cast<FunctionCallExpr *>(node) != NULL) {
// FIXME: If we could somehow determine that the function being
// called was safe (and all of the args Exprs were safe, then it'd
// be nice to be able to return true here. (Consider a call to
// e.g. floatbits() in the stdlib.) Unfortunately for now we just
// have to be conservative.
*okPtr = false;
return false;
FunctionCallExpr *fce;
if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
if (fce->func == NULL)
return false;
const Type *type = fce->func->GetType();
const PointerType *pt = CastType<PointerType>(type);
if (pt != NULL)
type = pt->GetBaseType();
const FunctionType *ftype = CastType<FunctionType>(type);
Assert(ftype != NULL);
if (ftype->isSafe == false) {
*okPtr = false;
return false;
}
}
if (dynamic_cast<AssertStmt *>(node) != NULL) {
@@ -350,17 +397,29 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
return false;
}
if (g->target.allOffMaskIsSafe == true)
// Don't worry about memory accesses if we have a target that can
// safely run them with the mask all off
return true;
if (dynamic_cast<ForeachStmt *>(node) != NULL ||
dynamic_cast<ForeachActiveStmt *>(node) != NULL ||
dynamic_cast<ForeachUniqueStmt *>(node) != NULL ||
dynamic_cast<UnmaskedStmt *>(node) != NULL) {
// The various foreach statements also shouldn't be run with an
// all-off mask. Since they can re-establish an 'all on' mask,
// this would be pretty unintuitive. (More generally, it's
// possibly a little strange to allow foreach in the presence of
// any non-uniform control flow...)
//
// Similarly, the implementation of foreach_unique assumes as a
// precondition that the mask won't be all off going into it, so
// we'll enforce that here...
*okPtr = false;
return false;
}
IndexExpr *ie;
if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
const Type *type = ie->baseExpr->GetType();
if (type == NULL)
return true;
if (dynamic_cast<const ReferenceType *>(type) != NULL)
if (CastType<ReferenceType>(type) != NULL)
type = type->GetReferenceTarget();
ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
@@ -370,16 +429,14 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
return false;
}
const PointerType *pointerType =
dynamic_cast<const PointerType *>(type);
const PointerType *pointerType = CastType<PointerType>(type);
if (pointerType != NULL) {
// pointer[index] -> can't be sure -> not safe
*okPtr = false;
return false;
}
const SequentialType *seqType =
dynamic_cast<const SequentialType *>(type);
const SequentialType *seqType = CastType<SequentialType>(type);
Assert(seqType != NULL);
int nElements = seqType->GetElementCount();
if (nElements == 0) {
@@ -409,13 +466,9 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
return false;
}
DereferenceExpr *de;
if ((de = dynamic_cast<DereferenceExpr *>(node)) != NULL) {
const Type *exprType = de->expr->GetType();
if (dynamic_cast<const PointerType *>(exprType) != NULL) {
*okPtr = false;
return false;
}
if (dynamic_cast<PtrDerefExpr *>(node) != NULL) {
*okPtr = false;
return false;
}
return true;

ast.h

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -84,8 +84,7 @@ class AST {
public:
/** Add the AST for a function described by the given declaration
information and source code. */
void AddFunction(Symbol *sym, const std::vector<Symbol *> &args,
Stmt *code);
void AddFunction(Symbol *sym, Stmt *code);
/** Generate LLVM IR for all of the functions into the current
module. */


@@ -2,8 +2,8 @@
REM If LLVM_INSTALL_DIR isn't set globally in your environment,
REM it can be set here.
set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
set LLVM_VERSION=3.1svn
REM set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
REM set LLVM_VERSION=3.2
REM Both the LLVM binaries and python need to be in the path
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin


@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -47,12 +47,25 @@
#include <math.h>
#include <stdlib.h>
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
#include <llvm/DerivedTypes.h>
#include <llvm/Instructions.h>
#include <llvm/Intrinsics.h>
#if defined(LLVM_3_2)
#include <llvm/Attributes.h>
#endif
#if defined(LLVM_3_1) || defined(LLVM_3_2)
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
#include <llvm/Instructions.h>
#include <llvm/Intrinsics.h>
#include <llvm/DerivedTypes.h>
#else
#include <llvm/IR/Attributes.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Type.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/DerivedTypes.h>
#endif
#include <llvm/Linker.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/ADT/Triple.h>
@@ -157,7 +170,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
static void
lCreateSymbol(const std::string &name, const Type *returnType,
const std::vector<const Type *> &argTypes,
llvm::SmallVector<const Type *, 8> &argTypes,
const llvm::FunctionType *ftype, llvm::Function *func,
SymbolTable *symbolTable) {
SourcePos noPos;
@@ -199,7 +212,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
// bool, so just have a one-off override for that one...
if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") {
const Type *returnType = AtomicType::VaryingInt32;
std::vector<const Type *> argTypes;
llvm::SmallVector<const Type *, 8> argTypes;
argTypes.push_back(AtomicType::VaryingBool);
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
@@ -229,7 +242,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
// Iterate over the arguments and try to find their equivalent ispc
// types. Track if any of the arguments has an integer type.
bool anyIntArgs = false;
std::vector<const Type *> argTypes;
llvm::SmallVector<const Type *, 8> argTypes;
for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
const llvm::Type *llvmArgType = ftype->getParamType(j);
const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
@@ -291,7 +304,7 @@ lCheckModuleIntrinsics(llvm::Module *module) {
if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
Assert(id != 0);
LLVM_TYPE_CONST llvm::Type *intrinsicType =
llvm::Type *intrinsicType =
llvm::Intrinsic::getType(*g->ctx, id);
intrinsicType = llvm::PointerType::get(intrinsicType, 0);
Assert(func->getType() == intrinsicType);
@@ -322,6 +335,8 @@ lSetInternalFunctions(llvm::Module *module) {
"__add_varying_double",
"__add_varying_int32",
"__add_varying_int64",
"__all",
"__any",
"__aos_to_soa3_float",
"__aos_to_soa3_float16",
"__aos_to_soa3_float4",
@@ -411,12 +426,17 @@ lSetInternalFunctions(llvm::Module *module) {
"__extract_int64",
"__extract_int8",
"__fastmath",
"__float_to_half_uniform",
"__float_to_half_varying",
"__floatbits_uniform_int32",
"__floatbits_varying_int32",
"__floor_uniform_double",
"__floor_uniform_float",
"__floor_varying_double",
"__floor_varying_float",
"__get_system_isa",
"__half_to_float_uniform",
"__half_to_float_varying",
"__insert_int16",
"__insert_int32",
"__insert_int64",
@@ -438,6 +458,12 @@ lSetInternalFunctions(llvm::Module *module) {
"__max_varying_uint32",
"__max_varying_uint64",
"__memory_barrier",
"__memcpy32",
"__memcpy64",
"__memmove32",
"__memmove64",
"__memset32",
"__memset64",
"__min_uniform_double",
"__min_uniform_float",
"__min_uniform_int32",
@@ -454,9 +480,11 @@ lSetInternalFunctions(llvm::Module *module) {
"__new_uniform",
"__new_varying32",
"__new_varying64",
"__none",
"__num_cores",
"__packed_load_active",
"__packed_store_active",
"__pause",
"__popcnt_int32",
"__popcnt_int64",
"__prefetch_read_uniform_1",
@@ -465,12 +493,13 @@ lSetInternalFunctions(llvm::Module *module) {
"__prefetch_read_uniform_nt",
"__rcp_uniform_float",
"__rcp_varying_float",
"__rdrand_i16",
"__rdrand_i32",
"__rdrand_i64",
"__reduce_add_double",
"__reduce_add_float",
"__reduce_add_int32",
"__reduce_add_int64",
"__reduce_add_uint32",
"__reduce_add_uint64",
"__reduce_equal_double",
"__reduce_equal_float",
"__reduce_equal_int32",
@@ -499,6 +528,7 @@ lSetInternalFunctions(llvm::Module *module) {
"__round_varying_float",
"__rsqrt_uniform_float",
"__rsqrt_varying_float",
"__set_system_isa",
"__sext_uniform_bool",
"__sext_varying_bool",
"__shuffle2_double",
@@ -527,6 +557,8 @@ lSetInternalFunctions(llvm::Module *module) {
"__sqrt_uniform_float",
"__sqrt_varying_double",
"__sqrt_varying_float",
"__stdlib_acosf",
"__stdlib_asinf",
"__stdlib_atan",
"__stdlib_atan2",
"__stdlib_atan2f",
@@ -606,11 +638,18 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
mTriple.getVendor() == bcTriple.getVendor());
bcModule->setTargetTriple(mTriple.str());
// This is also suboptimal; LLVM issues a warning about linking
// modules with different datalayouts, due to things like
// builtins-c.c having the regular IA layout, but the generic
// targets having a layout with 16-bit alignment for 16xi1 vectors.
// As long as builtins-c.c doesn't have any 16xi1 vector types
// (which it shouldn't!), then this override is safe.
if (g->target.isa == Target::GENERIC)
bcModule->setDataLayout(module->getDataLayout());
std::string linkError;
if (llvm::Linker::LinkModules(module, bcModule,
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::Linker::DestroySource,
#endif // LLVM_3_0
&linkError))
Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
lSetInternalFunctions(module);
@@ -627,15 +666,37 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
static void
lDefineConstantInt(const char *name, int val, llvm::Module *module,
SymbolTable *symbolTable) {
Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32,
SC_STATIC);
pw->constValue = new ConstExpr(pw->type, val, SourcePos());
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
Symbol *sym =
new Symbol(name, SourcePos(), AtomicType::UniformInt32->GetAsConstType(),
SC_STATIC);
sym->constValue = new ConstExpr(sym->type, val, SourcePos());
llvm::Type *ltype = LLVMTypes::Int32Type;
llvm::Constant *linit = LLVMInt32(val);
pw->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
llvm::GlobalValue::InternalLinkage,
linit, pw->name.c_str());
symbolTable->AddVariable(pw);
// Use WeakODRLinkage rather than InternalLinkage so that a definition
// survives even if it's not used in the module, so that the symbol is
// there in the debugger.
llvm::GlobalValue::LinkageTypes linkage = g->generateDebuggingSymbols ?
llvm::GlobalValue::WeakODRLinkage : llvm::GlobalValue::InternalLinkage;
sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, linkage,
linit, name);
symbolTable->AddVariable(sym);
if (m->diBuilder != NULL) {
llvm::DIFile file;
llvm::DIType diType = sym->type->GetDIType(file);
Assert(diType.Verify());
// FIXME? DWARF says that this (and programIndex below) should
// have the DW_AT_artificial attribute. It's not clear if this
// matters for anything though.
llvm::DIGlobalVariable var =
m->diBuilder->createGlobalVariable(name,
file,
0 /* line */,
diType,
true /* static */,
sym->storagePtr);
Assert(var.Verify());
}
}
@@ -643,13 +704,17 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
static void
lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
SymbolTable *symbolTable) {
std::vector<const Type *> args;
llvm::SmallVector<const Type *, 8> args;
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
llvm::Function *func = module->getFunction(name);
Assert(func != NULL); // it should be declared already...
#if defined(LLVM_3_2)
func->addFnAttr(llvm::Attributes::AlwaysInline);
#else
func->addFnAttr(llvm::Attribute::AlwaysInline);
#endif
llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
@@ -661,20 +726,37 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
static void
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
Symbol *pidx = new Symbol("programIndex", SourcePos(),
AtomicType::VaryingConstInt32, SC_STATIC);
Symbol *sym =
new Symbol("programIndex", SourcePos(),
AtomicType::VaryingInt32->GetAsConstType(), SC_STATIC);
int pi[ISPC_MAX_NVEC];
for (int i = 0; i < g->target.vectorWidth; ++i)
pi[i] = i;
pidx->constValue = new ConstExpr(pidx->type, pi, SourcePos());
sym->constValue = new ConstExpr(sym->type, pi, SourcePos());
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32VectorType;
llvm::Type *ltype = LLVMTypes::Int32VectorType;
llvm::Constant *linit = LLVMInt32Vector(pi);
pidx->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
llvm::GlobalValue::InternalLinkage, linit,
pidx->name.c_str());
symbolTable->AddVariable(pidx);
// See comment in lDefineConstantInt() for why WeakODRLinkage is used here
llvm::GlobalValue::LinkageTypes linkage = g->generateDebuggingSymbols ?
llvm::GlobalValue::WeakODRLinkage : llvm::GlobalValue::InternalLinkage;
sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, linkage,
linit, sym->name.c_str());
symbolTable->AddVariable(sym);
if (m->diBuilder != NULL) {
llvm::DIFile file;
llvm::DIType diType = sym->type->GetDIType(file);
Assert(diType.Verify());
llvm::DIGlobalVariable var =
m->diBuilder->createGlobalVariable(sym->name.c_str(),
file,
0 /* line */,
diType,
false /* static */,
sym->storagePtr);
Assert(var.Verify());
}
}
@@ -756,6 +838,26 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
FATAL("logic error in DefineStdlib");
}
break;
case Target::AVX11:
switch (g->target.vectorWidth) {
case 8:
extern unsigned char builtins_bitcode_avx11[];
extern int builtins_bitcode_avx11_length;
AddBitcodeToModule(builtins_bitcode_avx11,
builtins_bitcode_avx11_length,
module, symbolTable);
break;
case 16:
extern unsigned char builtins_bitcode_avx11_x2[];
extern int builtins_bitcode_avx11_x2_length;
AddBitcodeToModule(builtins_bitcode_avx11_x2,
builtins_bitcode_avx11_x2_length,
module, symbolTable);
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
case Target::AVX2:
switch (g->target.vectorWidth) {
case 8:
@@ -799,6 +901,20 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
builtins_bitcode_generic_16_length,
module, symbolTable);
break;
case 32:
extern unsigned char builtins_bitcode_generic_32[];
extern int builtins_bitcode_generic_32_length;
AddBitcodeToModule(builtins_bitcode_generic_32,
builtins_bitcode_generic_32_length,
module, symbolTable);
break;
case 64:
extern unsigned char builtins_bitcode_generic_64[];
extern int builtins_bitcode_generic_64_length;
AddBitcodeToModule(builtins_bitcode_generic_64,
builtins_bitcode_generic_64_length,
module, symbolTable);
break;
case 1:
extern unsigned char builtins_bitcode_generic_1[];
extern int builtins_bitcode_generic_1_length;
@@ -831,10 +947,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
symbolTable);
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
symbolTable);
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
symbolTable);
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload,
module, symbolTable);
lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
lDefineConstantInt("__have_native_half", g->target.hasHalf, module,
symbolTable);
lDefineConstantInt("__have_native_rand", g->target.hasRand, module,
symbolTable);
lDefineConstantInt("__have_native_transcendentals", g->target.hasTranscendentals,
module, symbolTable);
if (includeStdlibISPC) {


@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -59,22 +59,39 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
typedef int Bool;
#define PRINT_SCALAR(fmt, type) \
printf(fmt, *((type *)ptr)); \
#define PRINT_BUF_SIZE 4096
#define APPEND(str) \
do { \
int offset = bufp - &printString[0]; \
*bufp = '\0'; \
strncat(bufp, str, PRINT_BUF_SIZE-offset); \
bufp += strlen(str); \
if (bufp >= &printString[PRINT_BUF_SIZE]) \
goto done; \
} while (0) /* eat semicolon */
#define PRINT_SCALAR(fmt, type) \
sprintf(tmpBuf, fmt, *((type *)ptr)); \
APPEND(tmpBuf); \
break
#define PRINT_VECTOR(fmt, type) \
putchar('['); \
*bufp++ = '['; \
if (bufp == &printString[PRINT_BUF_SIZE]) break; \
for (int i = 0; i < width; ++i) { \
/* only print the value if the current lane is executing */ \
if (mask & (1<<i)) \
printf(fmt, ((type *)ptr)[i]); \
if (mask & (1ull<<i)) \
sprintf(tmpBuf, fmt, ((type *)ptr)[i]); \
else \
printf("((" fmt "))", ((type *)ptr)[i]); \
putchar(i != width-1 ? ',' : ']'); \
sprintf(tmpBuf, "((" fmt "))", ((type *)ptr)[i]); \
APPEND(tmpBuf); \
*bufp++ = (i != width-1 ? ',' : ']'); \
} \
break
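The do { ... } while (0) wrapper on APPEND() (the /* eat semicolon */ comment) is the usual way to make a multi-statement macro behave as a single statement, so it can sit inside an if/else without braces, which is exactly how the 'B' case later in this file uses it. A small, self-contained illustration of the idiom; the macro and function names here are made up, not ispc code:
#include <stdio.h>
#define LOG_EXAMPLE(s)  do { fputs(s, stderr); fputc('\n', stderr); } while (0)
void idiomExample(int ok) {
    if (ok)
        LOG_EXAMPLE("ok");       /* expands to one statement, so the else still binds */
    else
        LOG_EXAMPLE("not ok");
}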
@@ -89,16 +106,18 @@ typedef int Bool;
@param mask Current lane mask when the print statement is called
@param args Array of pointers to the values to be printed
*/
void __do_print(const char *format, const char *types, int width, int mask,
void __do_print(const char *format, const char *types, int width, uint64_t mask,
void **args) {
if (mask == 0)
return;
char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
char *bufp = &printString[0];
char tmpBuf[256];
int argCount = 0;
while (*format) {
while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
// Format strings are just single percent signs.
if (*format != '%')
putchar(*format);
if (*format != '%') {
*bufp++ = *format;
}
else {
if (*types) {
void *ptr = args[argCount++];
@@ -107,17 +126,22 @@ void __do_print(const char *format, const char *types, int width, int mask,
// printf() formatting string.
switch (*types) {
case 'b': {
printf("%s", *((Bool *)ptr) ? "true" : "false");
sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
APPEND(tmpBuf);
break;
}
case 'B': {
putchar('[');
*bufp++ = '[';
if (bufp == &printString[PRINT_BUF_SIZE])
break;
for (int i = 0; i < width; ++i) {
if (mask & (1<<i))
printf("%s", ((Bool *)ptr)[i] ? "true" : "false");
if (mask & (1ull << i)) {
sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
APPEND(tmpBuf);
}
else
printf("_________");
putchar(i != width-1 ? ',' : ']');
APPEND("_________");
*bufp++ = (i != width-1) ? ',' : ']';
}
break;
}
@@ -136,14 +160,18 @@ void __do_print(const char *format, const char *types, int width, int mask,
case 'p': PRINT_SCALAR("%p", void *);
case 'P': PRINT_VECTOR("%p", void *);
default:
printf("UNKNOWN TYPE ");
putchar(*types);
APPEND("UNKNOWN TYPE ");
*bufp++ = *types;
}
++types;
}
}
++format;
}
done:
*bufp = '\0';
fputs(printString, stdout);
fflush(stdout);
}
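For orientation, a hedged sketch of what a call into the new __do_print() entry point might look like. ispc emits these calls itself; the type-code string ('I' standing for a varying int32) is assumed by analogy with the 'b'/'B' pair handled above and is illustrative only.
#include <stdint.h>
extern "C" void __do_print(const char *format, const char *types, int width,
                           uint64_t mask, void **args);
void printExample() {
    // 4-wide vector with lanes 0 and 2 active (mask = 0b0101); inactive lanes
    // are printed wrapped in double parentheses, e.g. "x = [1,((2)),3,((4))]".
    int32_t vals[4] = { 1, 2, 3, 4 };
    void *args[1] = { (void *)vals };
    __do_print("x = %\n", "I", 4, 0x5ull, args);
}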


@@ -48,8 +48,8 @@ declare void @abort() noreturn
;; corresponding to one of the Target::ISA enumerant values that gives the
;; most capable ISA that the current system can run.
;;
;; Note: clang from LLVM 2.9 should be used if this is updated, for maximum
;; backwards compatibility for anyone building ispc with LLVM 2.9.
;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum
;; backwards compatibility for anyone building ispc with LLVM 3.0.
;;
;; #include <stdint.h>
;; #include <stdlib.h>
@@ -76,13 +76,19 @@ declare void @abort() noreturn
;; /* NOTE: the values returned below must be the same as the
;; corresponding enumerant values in Target::ISA. */
;; if ((info[2] & (1 << 28)) != 0) {
;; // AVX1 for sure. Do we have AVX2?
;; // Call cpuid with eax=7, ecx=0
;; __cpuid_count(info, 7, 0);
;; if ((info[1] & (1 << 5)) != 0)
;; return 3; // AVX2
;; else
;; return 2; // AVX1
;; if ((info[2] & (1 << 29)) != 0 && // F16C
;; (info[2] & (1 << 30)) != 0) { // RDRAND
;; // So far, so good. AVX2?
;; // Call cpuid with eax=7, ecx=0
;; int info2[4];
;; __cpuid_count(info2, 7, 0);
;; if ((info2[1] & (1 << 5)) != 0)
;; return 4;
;; else
;; return 3;
;; }
;; // Regular AVX
;; return 2;
;; }
;; else if ((info[2] & (1 << 19)) != 0)
;; return 1; // SSE4
@@ -92,41 +98,44 @@ declare void @abort() noreturn
;; abort();
;; }
%0 = type { i32, i32, i32, i32 }
define i32 @__get_system_isa() nounwind ssp {
define i32 @__get_system_isa() nounwind uwtable ssp {
entry:
%0 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
%asmresult9.i = extractvalue %0 %0, 2
%asmresult10.i = extractvalue %0 %0, 3
%and = and i32 %asmresult9.i, 268435456
%0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
%asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2
%asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
%and = and i32 %asmresult5.i, 268435456
%cmp = icmp eq i32 %and, 0
br i1 %cmp, label %if.else7, label %if.then
br i1 %cmp, label %if.else13, label %if.then
if.then: ; preds = %entry
%1 = tail call %0 asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult9.i24 = extractvalue %0 %1, 1
%and4 = lshr i32 %asmresult9.i24, 5
%2 = and i32 %and4, 1
%3 = or i32 %2, 2
%1 = and i32 %asmresult5.i, 1610612736
%2 = icmp eq i32 %1, 1610612736
br i1 %2, label %if.then7, label %return
if.then7: ; preds = %if.then
%3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1
%and10 = lshr i32 %asmresult4.i28, 5
%4 = and i32 %and10, 1
%5 = add i32 %4, 3
br label %return
if.else7: ; preds = %entry
%and10 = and i32 %asmresult9.i, 524288
%cmp11 = icmp eq i32 %and10, 0
br i1 %cmp11, label %if.else13, label %return
if.else13: ; preds = %entry
%and15 = and i32 %asmresult5.i, 524288
%cmp16 = icmp eq i32 %and15, 0
br i1 %cmp16, label %if.else18, label %return
if.else13: ; preds = %if.else7
%and16 = and i32 %asmresult10.i, 67108864
%cmp17 = icmp eq i32 %and16, 0
br i1 %cmp17, label %if.else19, label %return
if.else18: ; preds = %if.else13
%and20 = and i32 %asmresult6.i, 67108864
%cmp21 = icmp eq i32 %and20, 0
br i1 %cmp21, label %if.else23, label %return
if.else19: ; preds = %if.else13
if.else23: ; preds = %if.else18
tail call void @abort() noreturn nounwind
unreachable
return: ; preds = %if.else13, %if.else7, %if.then
%retval.0 = phi i32 [ %3, %if.then ], [ 1, %if.else7 ], [ 0, %if.else13 ]
return: ; preds = %if.else18, %if.else13, %if.then7, %if.then
%retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ]
ret i32 %retval.0
}
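Reassembling the comment fragments above, the logic the IR implements comes down to the following sketch. This is a hedged reconstruction, not the literal source the IR was generated from: it assumes GCC/Clang's __cpuid/__cpuid_count macros from <cpuid.h>, and the return values 0-4 are the Target::ISA enumerants mentioned in the comments.
#include <cpuid.h>
#include <stdlib.h>
static int getSystemISAExample(void) {
    unsigned int eax, ebx, ecx, edx;
    __cpuid(1, eax, ebx, ecx, edx);
    if (ecx & (1u << 28)) {                              // AVX
        if ((ecx & (1u << 29)) && (ecx & (1u << 30))) {  // F16C and RDRAND
            unsigned int eax7, ebx7, ecx7, edx7;
            __cpuid_count(7, 0, eax7, ebx7, ecx7, edx7);
            return (ebx7 & (1u << 5)) ? 4 : 3;           // AVX2 : AVX 1.1
        }
        return 2;                                        // regular AVX
    }
    if (ecx & (1u << 19))
        return 1;                                        // SSE4
    if (edx & (1u << 26))
        return 0;                                        // SSE2
    abort();
}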


@@ -254,10 +254,10 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -158,13 +158,13 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
define <16 x float> @__max_varying_float(<16 x float>,
<16 x float>) nounwind readonly alwaysinline {
<16 x float>) nounwind readonly alwaysinline {
binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
ret <16 x float> %call
}
define <16 x float> @__min_varying_float(<16 x float>,
<16 x float>) nounwind readonly alwaysinline {
<16 x float>) nounwind readonly alwaysinline {
binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
ret <16 x float> %call
}
@@ -175,7 +175,7 @@ define <16 x float> @__min_varying_float(<16 x float>,
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -186,9 +186,57 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp eq i32 %v, 65535
ret i1 %cmp
}
define i1 @__none(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
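All three reductions build the same 16-bit movmsk value and differ only in the final comparison; in scalar terms they reduce to the following model (illustrative only, not ispc code):
#include <stdint.h>
// Scalar model of the mask reductions above for the 16-wide target.
static inline bool anyExample(uint32_t movmsk16)  { return movmsk16 != 0; }
static inline bool allExample(uint32_t movmsk16)  { return movmsk16 == 0xffff; }  // 65535
static inline bool noneExample(uint32_t movmsk16) { return movmsk16 == 0; }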
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops
@@ -224,7 +272,7 @@ reduce_equal(16)
;; horizontal int32 ops
define <16 x i32> @__add_varying_int32(<16 x i32>,
<16 x i32>) nounwind readnone alwaysinline {
<16 x i32>) nounwind readnone alwaysinline {
%s = add <16 x i32> %0, %1
ret <16 x i32> %s
}
@@ -252,11 +300,6 @@ define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops
define i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<16 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
@@ -334,11 +377,6 @@ define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops
define i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
%r = call i64 @__reduce_add_int64(<16 x i64> %v)
ret i64 %r
}
define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
@@ -352,19 +390,14 @@ define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(16, i8, 8)
load_and_broadcast(16, i16, 16)
load_and_broadcast(16, i32, 32)
load_and_broadcast(16, i64, 64)
; no masked load instruction for i8 and i16 types??
masked_load(16, i8, 8, 1)
masked_load(16, i16, 16, 2)
masked_load(i8, 1)
masked_load(i16, 2)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
define <16 x i32> @__masked_load_i32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <16 x i32> %mask to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -382,7 +415,7 @@ define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin
}
define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
; double up masks, bitcast to doubles
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -416,6 +449,7 @@ define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
ret <16 x i64> %val
}
masked_load_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
@@ -423,15 +457,15 @@ define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
; FIXME: there is no AVX instruction for these, but we could be clever
; by packing the bits down and setting the last 3/4 or half, respectively,
; of the mask to zero... Not sure if this would be a win in the end
gen_masked_store(16, i8, 8)
gen_masked_store(16, i16, 16)
gen_masked_store(i8)
gen_masked_store(i16)
; note that mask is the 2nd parameter, not the 3rd one!!
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
%ptr = bitcast <16 x i32> * %0 to i8 *
%val = bitcast <16 x i32> %1 to <16 x float>
%mask = bitcast <16 x i32> %2 to <16 x float>
@@ -453,8 +487,8 @@ define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
ret void
}
define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
<16 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
<16 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast <16 x i64> * %0 to i8 *
%val = bitcast <16 x i64> %1 to <16 x double>
@@ -492,14 +526,15 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
ret void
}
masked_store_float_double()
masked_store_blend_8_16_by_16()
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
<8 x float>) nounwind readnone
define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
%maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
%oldValue = load <16 x i32>* %0, align 4
%oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
@@ -536,8 +571,8 @@ define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
<4 x double>) nounwind readnone
define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
<16 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
<16 x i32> %mask) nounwind alwaysinline {
%oldValue = load <16 x i64>* %ptr, align 8
%old = bitcast <16 x i64> %oldValue to <16 x double>
%old0d = shufflevector <16 x double> %old, <16 x double> undef,
@@ -597,10 +632,12 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter
gen_scatter(16, i8)
gen_scatter(16, i16)
gen_scatter(16, i32)
gen_scatter(16, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -158,13 +158,13 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
define <8 x float> @__max_varying_float(<8 x float>,
<8 x float>) nounwind readonly alwaysinline {
<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
ret <8 x float> %call
}
define <8 x float> @__min_varying_float(<8 x float>,
<8 x float>) nounwind readonly alwaysinline {
<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
ret <8 x float> %call
}
@@ -175,10 +175,32 @@ define <8 x float> @__min_varying_float(<8 x float>,
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -239,11 +261,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
@@ -315,11 +332,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops
define i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
%r = call i64 @__reduce_add_int64(<8 x i64> %v)
ret i64 %r
}
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
@@ -333,19 +345,15 @@ define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
; no masked load instruction for i8 and i16 types??
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
masked_load(i8, 1)
masked_load(i16, 2)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
define <8 x i32> @__masked_load_i32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <8 x i32> %mask to <8 x float>
%floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
%retval = bitcast <8 x float> %floatval to <8 x i32>
@@ -353,7 +361,7 @@ define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline
}
define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
; double up masks, bitcast to doubles
%mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -372,19 +380,20 @@ define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
ret <8 x i64> %val
}
masked_load_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(i8)
gen_masked_store(i16)
; note that mask is the 2nd parameter, not the 3rd one!!
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
define void @__masked_store_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
%ptr = bitcast <8 x i32> * %0 to i8 *
%val = bitcast <8 x i32> %1 to <8 x float>
%mask = bitcast <8 x i32> %2 to <8 x float>
@@ -392,8 +401,8 @@ define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
ret void
}
define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>,
<8 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast <8 x i64> * %0 to i8 *
%val = bitcast <8 x i64> %1 to <8 x double>
@@ -417,14 +426,13 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
}
masked_store_blend_8_16_by_8()
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
<8 x float>) nounwind readnone
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
%oldValue = load <8 x i32>* %0, align 4
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
@@ -438,8 +446,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
}
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %i32mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %i32mask) nounwind alwaysinline {
%oldValue = load <8 x i64>* %ptr, align 8
%mask = bitcast <8 x i32> %i32mask to <8 x float>
@@ -488,14 +496,17 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void
}
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter
gen_scatter(8, i8)
gen_scatter(8, i16)
gen_scatter(8, i32)
gen_scatter(8, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt


@@ -31,6 +31,8 @@
include(`target-avx-x2.ll')
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -61,17 +63,19 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
ifelse(NO_HALF_DECLARES, `1', `', `
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)


@@ -31,6 +31,8 @@
include(`target-avx.ll')
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -61,15 +63,19 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
ifelse(NO_HALF_DECLARES, `1', `', `
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

builtins/target-avx11-x2.ll (new file)

@@ -0,0 +1,132 @@
;; Copyright (c) 2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx-x2.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
%r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
%r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
%r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %r
}
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
%r_0 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
%r_1 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
%r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %r
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
'
)
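The uniform conversions above round-trip a single value through an 8-wide vector because vcvtph2ps/vcvtps2ph only exist as vector operations. With compiler intrinsics the uniform paths look roughly like this; a hedged sketch assuming an F16C-enabled compiler that provides the scalar _cvtsh_ss/_cvtss_sh intrinsics in <immintrin.h>:
#include <immintrin.h>
#include <stdint.h>
// Hedged scalar equivalents of __half_to_float_uniform / __float_to_half_uniform.
static inline float halfToFloatExample(uint16_t h) {
    return _cvtsh_ss(h);          // scalar F16C half -> float
}
static inline uint16_t floatToHalfExample(float f) {
    return _cvtss_sh(f, 0);       // 0 selects round-to-nearest-even
}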

builtins/target-avx11.ll (new file)

@@ -0,0 +1,115 @@
;; Copyright (c) 2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
%r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
ret <8 x float> %r
}
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
%r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
ret <8 x i16> %r
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -29,8 +29,16 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ifelse(LLVM_VERSION, `LLVM_3_0', `',
LLVM_VERSION, `LLVM_3_1', `',
`define(`HAVE_GATHER', `1')')
include(`target-avx-x2.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -66,6 +74,9 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -116,14 +127,435 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)
declare void @llvm.trap() noreturn nounwind
; $1: type
; $2: var base name
define(`extract_4s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
')
; $1: type
; $2: var base name
define(`extract_8s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
; $1: element type
; $2: ret name
; $3: v1
; $4: v2
define(`assemble_8s', `
%$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
; $1: element type
; $2: ret name
; $3: v1
; $4: v2
; $5: v3
; $6: v4
define(`assemble_4s', `
%$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
assemble_8s($1, $2, $2_1, $2_2)
')
ifelse(LLVM_VERSION, `LLVM_3_0', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)',
LLVM_VERSION, `LLVM_3_1', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)', `
gen_gather(i8)
gen_gather(i16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_8s(i32, offsets)
extract_8s(i32, vecmask)
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale8)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale8)
assemble_8s(i32, v, v1, v2)
ret <16 x i32> %v
}
define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_4s(i32, vecmask)
extract_4s(i64, offsets)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale8)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale8)
assemble_4s(i32, v, v1, v2, v3, v4)
ret <16 x i32> %v
}
define <16 x i32> @__gather32_i32(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_8s(i32, ptrs)
extract_8s(i32, vecmask)
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1)
assemble_8s(i32, v, v1, v2)
ret <16 x i32> %v
}
define <16 x i32> @__gather64_i32(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_4s(i64, ptrs)
extract_4s(i32, vecmask)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1)
assemble_4s(i32, v, v1, v2, v3, v4)
ret <16 x i32> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
define <16 x float> @__gather_base_offsets32_float(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(i32, offsets)
extract_8s(float, mask)
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x float> %mask_1, i8 %scale8)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x float> %mask_2, i8 %scale8)
assemble_8s(float, v, v1, v2)
ret <16 x float> %v
}
define <16 x float> @__gather_base_offsets64_float(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, offsets)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x float> %mask_3, i8 %scale8)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x float> %mask_4, i8 %scale8)
assemble_4s(float, v, v1, v2, v3, v4)
ret <16 x float> %v
}
define <16 x float> @__gather32_float(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(float, mask)
extract_8s(i32, ptrs)
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x float> %mask_1, i8 1)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x float> %mask_2, i8 1)
assemble_8s(float, v, v1, v2)
ret <16 x float> %v
}
define <16 x float> @__gather64_float(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, ptrs)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x float> %mask_3, i8 1)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x float> %mask_4, i8 1)
assemble_4s(float, v, v1, v2, v3, v4)
ret <16 x float> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather32_i64(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather64_i64(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
define <16 x double> @__gather_base_offsets32_double(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather_base_offsets64_double(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather32_double(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x double> %vecmask_4, i8 1)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather64_double(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x double> %vecmask_4, i8 1)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
')
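To make the m4 plumbing above concrete, here is a hand-expanded sketch of the split/gather/reassemble pattern that extract_8s and assemble_8s produce for a 16-wide i32 gather over 32-bit offsets. The scale is fixed at 1 for brevity and the value names are illustrative, not the ones the macros actually emit.
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8 *, <8 x i32>, <8 x i32>, i8) readonly nounwind
define <16 x i32> @gather16_i32_example(i8 * %ptr, <16 x i32> %offsets,
                                        <16 x i32> %mask) nounwind readonly {
  ; split offsets and mask into low/high 8-wide halves
  %off_lo = shufflevector <16 x i32> %offsets, <16 x i32> undef,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %off_hi = shufflevector <16 x i32> %offsets, <16 x i32> undef,
      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %m_lo = shufflevector <16 x i32> %mask, <16 x i32> undef,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %m_hi = shufflevector <16 x i32> %mask, <16 x i32> undef,
      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; two 8-wide hardware gathers, scale 1
  %v_lo = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
      <8 x i32> %off_lo, <8 x i32> %m_lo, i8 1)
  %v_hi = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
      <8 x i32> %off_hi, <8 x i32> %m_hi, i8 1)
  ; stitch the halves back into a 16-wide result
  %v = shufflevector <8 x i32> %v_lo, <8 x i32> %v_hi,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i32> %v
}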


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -29,8 +29,16 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ifelse(LLVM_VERSION, `LLVM_3_0', `',
LLVM_VERSION, `LLVM_3_1', `',
`define(`HAVE_GATHER', `1')')
include(`target-avx.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -66,6 +74,9 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -100,11 +111,323 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
declare void @llvm.trap() noreturn nounwind
define(`extract_4s', `
%$2_1 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
')
ifelse(LLVM_VERSION, `LLVM_3_0', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)',
LLVM_VERSION, `LLVM_3_1', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)', `
gen_gather(i8)
gen_gather(i16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
define <8 x i32> @__gather_base_offsets32_i32(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets, <8 x i32> %vecmask, i8 %scale8)
ret <8 x i32> %v
}
define <8 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_4s(i32, vecmask)
extract_4s(i64, offsets)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %v
}
define <8 x i32> @__gather32_i32(<8 x i32> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs, <8 x i32> %vecmask, i8 1)
ret <8 x i32> %v
}
define <8 x i32> @__gather64_i32(<8 x i64> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
extract_4s(i64, ptrs)
extract_4s(i32, vecmask)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
define <8 x float> @__gather_base_offsets32_float(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <8 x i32> %vecmask to <8 x float>
%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets, <8 x float> %mask, i8 %scale8)
ret <8 x float> %v
}
define <8 x float> @__gather_base_offsets64_float(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <8 x i32> %vecmask to <8 x float>
extract_4s(i64, offsets)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
%v = shufflevector <4 x float> %v1, <4 x float> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %v
}
define <8 x float> @__gather32_float(<8 x i32> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <8 x i32> %vecmask to <8 x float>
%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs, <8 x float> %mask, i8 1)
ret <8 x float> %v
}
define <8 x float> @__gather64_float(<8 x i64> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <8 x i32> %vecmask to <8 x float>
extract_4s(i64, ptrs)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
%v = shufflevector <4 x float> %v1, <4 x float> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
define <8 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i32, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i64, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather32_i64(<8 x i32> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i32, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather64_i64(<8 x i64> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i64, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
define <8 x double> @__gather_base_offsets32_double(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i32, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather_base_offsets64_double(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i64, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather32_double(<8 x i32> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i32, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather64_double(<8 x i64> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i64, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
')
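As a readability aid, this scalar sketch spells out what each lane of the masked base+offset gathers above computes: the effective address is base plus offset times scale, and the load only happens when the lane's mask element has its sign bit set (the convention the AVX2 gather instructions use); otherwise the previous value is kept. Function and label names are illustrative and not part of the diff.
define i32 @gather_one_lane_reference(i8 * %ptr, i32 %offset, i32 %scale,
                                      i32 %mask, i32 %old) nounwind {
entry:
  ; effective address = base + offset * scale
  %scaled = mul i32 %offset, %scale
  %addr = getelementptr i8 * %ptr, i32 %scaled
  %iaddr = bitcast i8 * %addr to i32 *
  ; the lane is active iff the sign bit of its mask element is set
  %active = icmp slt i32 %mask, 0
  br i1 %active, label %do_load, label %done
do_load:
  %val = load i32 * %iaddr
  br label %done
done:
  %r = phi i32 [ %val, %do_load ], [ %old, %entry ]
  ret i32 %r
}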

builtins/target-generic-1.ll Executable file → Normal file

@@ -13,42 +13,44 @@ aossoa()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(1, i8, 8)
gen_masked_store(1, i16, 16)
gen_masked_store(1, i32, 32)
gen_masked_store(1, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(1, i8, 8)
load_and_broadcast(1, i16, 16)
load_and_broadcast(1, i32, 32)
load_and_broadcast(1, i64, 64)
masked_load(1, i8, 8, 1)
masked_load(1, i16, 16, 2)
masked_load(1, i32, 32, 4)
masked_load(1, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(1, i8)
gen_gather(1, i16)
gen_gather(1, i32)
gen_gather(1, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(1, i8)
gen_scatter(1, i16)
gen_scatter(1, i32)
gen_scatter(1, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
<1 x i32> %mask) nounwind readnone alwaysinline {
; %mv = trunc <1 x i32> %mask to <1 x i8>
; %notmask = xor <1 x i8> %mv, <i8 -1>
; %cleared_old = and <1 x i8> %0, %notmask
@@ -69,7 +71,7 @@ define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
}
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
<1 x i32> %mask) nounwind readnone alwaysinline {
; %mv = trunc <1 x i32> %mask to <1 x i16>
; %notmask = xor <1 x i16> %mv, <i16 -1>
; %cleared_old = and <1 x i16> %0, %notmask
@@ -91,7 +93,7 @@ define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
<1 x i32> %mask) nounwind readnone alwaysinline {
; %notmask = xor <1 x i32> %mask, <i32 -1>
; %cleared_old = and <1 x i32> %0, %notmask
; %masked_new = and <1 x i32> %1, %mask
@@ -109,8 +111,9 @@ define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
ret <1 x i32> %r
}
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
<1 x i32> %mask) nounwind readnone alwaysinline {
; %newmask = zext <1 x i32> %mask to <1 x i64>
; %notmask = xor <1 x i64> %newmask, <i64 -1>
; %cleared_old = and <1 x i64> %0, %notmask
@@ -131,7 +134,7 @@ define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
}
define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
<1 x i32> %mask) nounwind readnone alwaysinline {
<1 x i32> %mask) nounwind readnone alwaysinline {
; %v0 = bitcast <1 x float> %0 to <1 x i32>
; %v1 = bitcast <1 x float> %1 to <1 x i32>
; %r = call <1 x i32> @__vselect_i32(<1 x i32> %v0, <1 x i32> %v1, <1 x i32> %mask)
@@ -154,23 +157,23 @@ define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_8(<1 x i8>* nocapture, <1 x i8>,
define void @__masked_store_blend_i8(<1 x i8>* nocapture, <1 x i8>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i8> * %0, align 4
%newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask)
store <1 x i8> %newval, <1 x i8> * %0, align 4
ret void
}
define void @__masked_store_blend_16(<1 x i16>* nocapture, <1 x i16>,
<1 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i16(<1 x i16>* nocapture, <1 x i16>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i16> * %0, align 4
%newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask)
store <1 x i16> %newval, <1 x i16> * %0, align 4
ret void
}
define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
define void @__masked_store_blend_i32(<1 x i32>* nocapture, <1 x i32>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i32> * %0, align 4
%newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask)
@@ -178,20 +181,43 @@ define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
ret void
}
define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
<1 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<1 x i64>* nocapture, <1 x i64>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i64> * %0, align 4
%newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask)
store <1 x i64> %newval, <1 x i64> * %0, align 4
ret void
}
define i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
masked_store_float_double()
define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp eq i32 %v, 1
ret i1 %cmp
}
define i1 @__none(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -476,11 +502,6 @@ define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
ret i32 %r
}
define i32 @__reduce_add_uint32(<1 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<1 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
@@ -932,4 +953,3 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
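A small usage sketch for the mask reductions added to this file above: generated code can call __any() on the current execution mask and branch around a masked block when every program instance is off. The caller below is illustrative; only the __any() declaration mirrors the 1-wide definition from the diff.
declare i1 @__any(<1 x i32>) nounwind readnone
define void @masked_region_example(<1 x i32> %mask) nounwind {
entry:
  %go = call i1 @__any(<1 x i32> %mask)
  br i1 %go, label %body, label %done
body:
  ; masked work for the active program instances would go here
  br label %done
done:
  ret void
}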


@@ -0,0 +1,33 @@
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`32')
include(`target-generic-common.ll')
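This new file (and the 64-wide companion that follows) only pins the vector width before pulling in the shared generic target; after m4 expansion every WIDTH-parameterized declaration in target-generic-common.ll becomes concrete. One example, taken from a declaration that appears later on this page:
; with define(`WIDTH',`32') in effect, m4 turns
;   declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
; into
declare <32 x i32> @__smear_i32(i32) nounwind readnone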


@@ -0,0 +1,33 @@
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`64')
include(`target-generic-common.ll')


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -29,12 +29,18 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32";
define(`MASK',`i1')
define(`HAVE_GATHER',`1')
define(`HAVE_SCATTER',`1')
include(`util.m4')
stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; broadcast/rotate/shuffle
@@ -46,6 +52,20 @@ declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
declare <WIDTH x float> @__setzero_float() nounwind readnone
declare <WIDTH x double> @__setzero_double() nounwind readnone
declare <WIDTH x i8> @__setzero_i8() nounwind readnone
declare <WIDTH x i16> @__setzero_i16() nounwind readnone
declare <WIDTH x i32> @__setzero_i32() nounwind readnone
declare <WIDTH x i64> @__setzero_i64() nounwind readnone
declare <WIDTH x float> @__undef_float() nounwind readnone
declare <WIDTH x double> @__undef_double() nounwind readnone
declare <WIDTH x i8> @__undef_i8() nounwind readnone
declare <WIDTH x i16> @__undef_i16() nounwind readnone
declare <WIDTH x i32> @__undef_i32() nounwind readnone
declare <WIDTH x i64> @__undef_i64() nounwind readnone
declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
declare <WIDTH x i8> @__broadcast_i8(<WIDTH x i8>, i32) nounwind readnone
@@ -201,7 +221,10 @@ declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
declare i32 @__movmsk(<WIDTH x i1>) nounwind readnone
declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone
declare i1 @__any(<WIDTH x i1>) nounwind readnone
declare i1 @__all(<WIDTH x i1>) nounwind readnone
declare i1 @__none(<WIDTH x i1>) nounwind readnone
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
@@ -211,7 +234,6 @@ declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_add_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
@@ -223,34 +245,48 @@ declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_add_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(WIDTH, i8, 8)
load_and_broadcast(WIDTH, i16, 16)
load_and_broadcast(WIDTH, i32, 32)
load_and_broadcast(WIDTH, i64, 64)
declare <WIDTH x i8> @__masked_load_8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i16> @__masked_load_16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i32> @__masked_load_32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i64> @__masked_load_64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x float> @__masked_load_float(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x double> @__masked_load_double(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare void @__masked_store_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_float(<WIDTH x float>* nocapture, <WIDTH x float>,
<WIDTH x i1>) nounwind
declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
declare void @__masked_store_double(<WIDTH x double>* nocapture, <WIDTH x double>,
<WIDTH x i1> %mask) nounwind
ifelse(LLVM_VERSION, `LLVM_3_1svn',`
define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
ifelse(LLVM_VERSION, `LLVM_3_0', `
declare void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
declare void @__masked_store_blend_double(<WIDTH x double>* nocapture, <WIDTH x double>,
<WIDTH x i1> %mask) nounwind
', `
define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i8> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
@@ -258,57 +294,64 @@ define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
ret void
}
define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind alwaysinline {
define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i16> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
ret void
}
define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind alwaysinline {
define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i32> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
ret void
}
define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x float> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x float> %1, <WIDTH x float> %v
store <WIDTH x float> %v1, <WIDTH x float> * %0
ret void
}
define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i64> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
store <WIDTH x i64> %v1, <WIDTH x i64> * %0
ret void
}
',`
declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
<WIDTH x double>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x double> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x double> %1, <WIDTH x double> %v
store <WIDTH x double> %v1, <WIDTH x double> * %0
ret void
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, <WIDTH x i32>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, <WIDTH x i64>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
@@ -318,7 +361,9 @@ declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
gather_scatter(i8)
gather_scatter(i16)
gather_scatter(i32)
gather_scatter(float)
gather_scatter(i64)
gather_scatter(double)
declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
<WIDTH x i1>) nounwind
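For one concrete instantiation of the gather_scatter macro above, with $1 bound to float and WIDTH hypothetically set to 16, the first pair of declarations expands to the post-change signatures, in which a base pointer, a scale, and raw offsets replace the old precomputed-offset form:
declare <16 x float> @__gather_base_offsets32_float(i8 * nocapture, i32, <16 x i32>,
                                                    <16 x i1>) nounwind readonly
declare <16 x float> @__gather_base_offsets64_float(i8 * nocapture, i32, <16 x i64>,
                                                    <16 x i1>) nounwind readonly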


@@ -33,6 +33,7 @@ ctlztz()
define_prefetches()
define_shuffles()
aossoa()
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -295,7 +295,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -309,7 +309,62 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
define <4 x float> @__vec4_add_float(<4 x float> %v0,
@@ -360,11 +415,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
@@ -397,7 +447,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone {
}
define <4 x i64> @__add_varying_int64(<4 x i64>,
<4 x i64>) nounwind readnone alwaysinline {
<4 x i64>) nounwind readnone alwaysinline {
%r = add <4 x i64> %0, %1
ret <4 x i64> %r
}
@@ -432,28 +482,30 @@ reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
masked_load(8, i32, 32, 4)
masked_load(8, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(8, i8)
gen_scatter(8, i16)
gen_scatter(8, i32)
gen_scatter(8, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float rounding
@@ -557,23 +609,23 @@ define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alway
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(8, i32, 32)
gen_masked_store(8, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_blend_8_16_by_8()
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
%val = load <8 x i32> * %0, align 4
%newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask)
store <8 x i32> %newval, <8 x i32> * %0, align 4
ret void
}
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
%oldValue = load <8 x i64>* %ptr, align 8
; Do 8x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
@@ -616,6 +668,8 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void
}
masked_store_float_double()
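The masked_store_blend_* routines above (and the masked_store_float_double() macro) all follow the same load/select/store shape. Here is a minimal reference sketch written directly over an i1 mask for clarity; the 8-wide target instead derives per-lane conditions from the sign bits of an <8 x i32> mask, and the function name is illustrative only.
define void @masked_store_blend_float_sketch(<8 x float>* nocapture %ptr, <8 x float> %new,
                                             <8 x i1> %mask) nounwind alwaysinline {
  ; read the old destination, keep old lanes where the mask is off
  %old = load <8 x float> * %ptr
  %blend = select <8 x i1> %mask, <8 x float> %new, <8 x float> %old
  store <8 x float> %blend, <8 x float> * %ptr
  ret void
}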
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -239,10 +239,32 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
@@ -281,18 +303,13 @@ define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
}
define double @__reduce_add_double(<4 x double>) nounwind readnone {
@@ -349,16 +366,16 @@ reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
%val = load <4 x i32> * %0, align 4
%newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
store <4 x i32> %newval, <4 x i32> * %0, align 4
ret void
}
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %mask) nounwind alwaysinline {
%oldValue = load <4 x i64>* %ptr, align 8
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
@@ -400,6 +417,8 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
}
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -551,35 +570,37 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
masked_load(4, i8, 8, 1)
masked_load(4, i16, 16, 2)
masked_load(4, i32, 32, 4)
masked_load(4, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)


@@ -33,6 +33,7 @@ ctlztz()
define_prefetches()
define_shuffles()
aossoa()
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -221,13 +221,13 @@ define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly al
; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>,
<8 x i32>) nounwind readonly alwaysinline {
<8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %call
}
define <8 x i32> @__max_varying_uint32(<8 x i32>,
<8 x i32>) nounwind readonly alwaysinline {
<8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %call
}
@@ -237,7 +237,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>,
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -251,7 +251,62 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
@@ -287,11 +342,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
}
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
}
@@ -324,7 +374,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone {
}
define <4 x i64> @__add_varying_int64(<4 x i64>,
<4 x i64>) nounwind readnone alwaysinline {
<4 x i64>) nounwind readnone alwaysinline {
%r = add <4 x i64> %0, %1
ret <4 x i64> %r
}
@@ -359,28 +409,30 @@ reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
masked_load(8, i32, 32, 4)
masked_load(8, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(8, i8)
gen_scatter(8, i16)
gen_scatter(8, i32)
gen_scatter(8, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float rounding
@@ -443,18 +495,18 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(8, i32, 32)
gen_masked_store(8, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_blend_8_16_by_8()
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
; do two 4-wide blends with blendvps
%mask_as_float = bitcast <8 x i32> %mask to <8 x float>
%mask_a = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
@@ -483,8 +535,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
ret void
}
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
; implement this as 4 blends of <4 x i32>s, which are actually bitcast
; <2 x i64>s...
@@ -550,6 +602,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void
}
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -271,10 +271,32 @@ define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alway
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
@@ -312,18 +334,13 @@ define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
}
define double @__reduce_add_double(<4 x double>) nounwind readnone {
@@ -383,8 +400,8 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
%oldValue = load <4 x i32>* %0, align 4
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
@@ -398,8 +415,8 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
}
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %i32mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %i32mask) nounwind alwaysinline {
%oldValue = load <4 x i64>* %ptr, align 8
%mask = bitcast <4 x i32> %i32mask to <4 x float>
@@ -450,35 +467,39 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
masked_load(4, i8, 8, 1)
masked_load(4, i16, 16, 2)
masked_load(4, i32, 32, 4)
masked_load(4, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -17,7 +17,7 @@ syn keyword ispcStatement cbreak ccontinue creturn launch print reference soa sy
syn keyword ispcConditional cif
syn keyword ispcRepeat cdo cfor cwhile
syn keyword ispcBuiltin programCount programIndex
syn keyword ispcType export int8 int16 int32 int64
syn keyword ispcType export uniform varying int8 int16 int32 int64
" Default highlighting
command -nargs=+ HiLink hi def link <args>

8
contrib/ispc.vim.README (new file)

@@ -0,0 +1,8 @@
To install vim syntax highlighting for ispc files:
1) Copy ispc.vim into ~/.vim/syntax/ispc.vim (creating the directory if necessary)
2) Create a filetype for ispc files to correspond to that syntax file.
To do this, create ~/.vim/ftdetect/ispc.vim and append the following line to it:
au BufRead,BufNewFile *.ispc set filetype=ispc

1753
ctx.cpp

File diff suppressed because it is too large

117
ctx.h

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -40,10 +40,20 @@
#include "ispc.h"
#include <map>
#include <llvm/InstrTypes.h>
#include <llvm/Instructions.h>
#include <llvm/Analysis/DIBuilder.h>
#include <llvm/Analysis/DebugInfo.h>
#if defined(LLVM_3_1) || defined(LLVM_3_2)
#include <llvm/InstrTypes.h>
#include <llvm/Instructions.h>
#else
#include <llvm/IR/InstrTypes.h>
#include <llvm/IR/Instructions.h>
#endif
#if defined(LLVM_3_1)
#include <llvm/Analysis/DebugInfo.h>
#include <llvm/Analysis/DIBuilder.h>
#else
#include <llvm/DebugInfo.h>
#include <llvm/DIBuilder.h>
#endif
struct CFInfo;
@@ -153,16 +163,17 @@ public:
bool uniformControlFlow);
/** Informs FunctionEmitContext of the value of the mask at the start
of a loop body. */
void SetLoopMask(llvm::Value *mask);
of a loop body or switch statement. */
void SetBlockEntryMask(llvm::Value *mask);
/** Informs FunctionEmitContext that code generation for a loop is
finished. */
void EndLoop();
/** Indicates that code generation for a 'foreach' or 'foreach_tiled'
loop is about to start. */
void StartForeach();
/** Indicates that code generation for a 'foreach', 'foreach_tiled',
'foreach_active', or 'foreach_unique' loop is about to start. */
enum ForeachType { FOREACH_REGULAR, FOREACH_ACTIVE, FOREACH_UNIQUE };
void StartForeach(ForeachType ft);
void EndForeach();
/** Emit code for a 'break' statement in a loop. If doCoherenceCheck
@@ -230,6 +241,13 @@ public:
bool InForeachLoop() const;
/** Temporarily disables emission of performance warnings from gathers
and scatters from subsequent code. */
void DisableGatherScatterWarnings();
/** Reenables emission of gather/scatter performance warnings. */
void EnableGatherScatterWarnings();
void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }
/** Step through the code and find label statements; create a basic
@@ -241,6 +259,10 @@ public:
new basic block that it starts. */
llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
/** Returns a vector of all labels in the context. This is
simply the key set of the labelMap */
std::vector<std::string> GetLabels();
/** Called to generate code for 'return' statement; value is the
expression in the return statement (if non-NULL), and
doCoherenceCheck indicates whether instructions should be generated
@@ -265,7 +287,7 @@ public:
llvm::Value *None(llvm::Value *mask);
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
i32 value wherein the i'th bit is on if and only if the i'th lane
i64 value wherein the i'th bit is on if and only if the i'th lane
of the mask is on. */
llvm::Value *LaneMask(llvm::Value *mask);
@@ -331,7 +353,7 @@ public:
/** Emits debugging information for the function parameter represented
by sym. */
void EmitFunctionParameterDebugInfo(Symbol *sym);
void EmitFunctionParameterDebugInfo(Symbol *sym, int parameterNum);
/** @} */
/** @name IR instruction emission
@@ -373,25 +395,35 @@ public:
array, for pointer types). */
llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *BitCastInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *PtrToIntInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *IntToPtrInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Instruction *TruncInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
LLVM_TYPE_CONST llvm::Type *type, const char *name = NULL);
llvm::Instruction *FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Type *type, const char *name = NULL);
llvm::Instruction *FPCastInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Instruction *SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Instruction *SExtInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Instruction *ZExtInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
/** Given two integer-typed values (but possibly one vector and the
other not, and or of possibly-different bit-widths), update their
values as needed so that the two have the same (more general)
type. */
void MatchIntegerTypes(llvm::Value **v0, llvm::Value **v1);
/** Create a new slice pointer out of the given pointer to an soa type
and an integer offset to a slice within that type. */
llvm::Value *MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset);
/** These GEP methods are generalizations of the standard ones in LLVM;
they support both uniform and varying basePtr values as well as
uniform and varying index values (arrays of indices). Varying base
@@ -412,7 +444,8 @@ public:
the type of the pointer, though it may be NULL if the base pointer
is uniform. */
llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
const Type *ptrType, const char *name = NULL);
const Type *ptrType, const char *name = NULL,
const PointerType **resultPtrType = NULL);
/** Load from the memory location(s) given by lvalue, using the given
mask. The lvalue may be varying, in which case this corresponds to
@@ -430,7 +463,7 @@ public:
instruction is added at the start of the function in the entry
basic block; if it should be added to the current basic block, then
the atEntryBlock parameter should be false. */
llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType,
llvm::Value *AllocaInst(llvm::Type *llvmType,
const char *name = NULL, int align = 0,
bool atEntryBlock = true);
@@ -443,7 +476,14 @@ public:
varying, the given storeMask is used to mask the stores so that
they only execute for the active program instances. */
void StoreInst(llvm::Value *value, llvm::Value *ptr,
llvm::Value *storeMask, const Type *ptrType);
llvm::Value *storeMask, const Type *valueType,
const Type *ptrType);
/** Copy count bytes of memory from the location pointed to by src to
the location pointed to by dest. (src and dest must not be
overlapping.) */
void MemcpyInst(llvm::Value *dest, llvm::Value *src, llvm::Value *count,
llvm::Value *align = NULL);
void BranchInst(llvm::BasicBlock *block);
void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
@@ -460,7 +500,7 @@ public:
llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
const char *name = NULL);
llvm::PHINode *PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
llvm::PHINode *PhiNode(llvm::Type *type, int count,
const char *name = NULL);
llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
llvm::Value *val1, const char *name = NULL);
@@ -531,9 +571,9 @@ private:
for error messages and debugging symbols. */
SourcePos funcStartPos;
/** If currently in a loop body, the value of the mask at the start of
the loop. */
llvm::Value *loopMask;
/** If currently in a loop body or switch statement, the value of the
mask at the start of it. */
llvm::Value *blockEntryMask;
/** If currently in a loop body or switch statement, this is a pointer
to memory to store a mask value that represents which of the lanes
@@ -607,12 +647,12 @@ private:
std::vector<CFInfo *> controlFlowInfo;
/** DIFile object corresponding to the source file where the current
function was defined (used for debugging info0. */
function was defined (used for debugging info). */
llvm::DIFile diFile;
/** DISubprogram corresponding to this function (used for debugging
info). */
llvm::DISubprogram diFunction;
llvm::DISubprogram diSubprogram;
/** These correspond to the current set of nested scopes in the
function. */
@@ -626,6 +666,10 @@ private:
tasks launched from the current function. */
llvm::Value *launchGroupHandlePtr;
/** Nesting count of the number of times calling code has disabled (and
not yet reenabled) gather/scatter performance warnings. */
int disableGSWarningCount;
std::map<std::string, llvm::BasicBlock *> labelMap;
static bool initLabelBBlocks(ASTNode *node, void *data);
@@ -646,12 +690,19 @@ private:
CFInfo *popCFState();
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
llvm::Value *mask);
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *valueType,
const Type *ptrType, llvm::Value *mask);
void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
llvm::Value *mask);
llvm::Value *gather(llvm::Value *ptr, const Type *ptrType, llvm::Value *mask,
const char *name);
void storeUniformToSOA(llvm::Value *value, llvm::Value *ptr,
llvm::Value *mask, const Type *valueType,
const PointerType *ptrType);
llvm::Value *loadUniformFromSOA(llvm::Value *ptr, llvm::Value *mask,
const PointerType *ptrType, const char *name);
llvm::Value *gather(llvm::Value *ptr, const PointerType *ptrType,
llvm::Value *mask, const char *name);
llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
};

590
decl.cpp

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -33,7 +33,7 @@
/** @file decl.cpp
@brief Implementations of classes related to turning declarations into
symbols and types.
symbol names and types.
*/
#include "decl.h"
@@ -44,6 +44,7 @@
#include "stmt.h"
#include "expr.h"
#include <stdio.h>
#include <string.h>
#include <set>
static void
@@ -55,6 +56,8 @@ lPrintTypeQualifiers(int typeQualifiers) {
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
if (typeQualifiers & TYPEQUAL_EXPORT) printf("export ");
if (typeQualifiers & TYPEQUAL_UNMASKED) printf("unmasked ");
}
@@ -69,12 +72,21 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
type = type->GetAsConstType();
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
type = type->GetAsUniformType();
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
type = type->GetAsVaryingType();
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) {
if (Type::Equal(type, AtomicType::Void))
Error(pos, "\"uniform\" qualifier is illegal with \"void\" type.");
else
type = type->GetAsUniformType();
}
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0) {
if (Type::Equal(type, AtomicType::Void))
Error(pos, "\"varying\" qualifier is illegal with \"void\" type.");
else
type = type->GetAsVaryingType();
}
else
type = type->GetAsUnboundVariabilityType();
if (Type::Equal(type, AtomicType::Void) == false)
type = type->GetAsUnboundVariabilityType();
if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
@@ -84,15 +96,20 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
const Type *unsignedType = type->GetAsUnsignedType();
if (unsignedType != NULL)
type = unsignedType;
else
else {
const Type *resolvedType =
type->ResolveUnboundVariability(Variability::Varying);
Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
resolvedType->GetString().c_str());
}
}
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false)
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false) {
const Type *resolvedType =
type->ResolveUnboundVariability(Variability::Varying);
Error(pos, "\"signed\" qualifier is illegal with non-integer type "
"\"%s\".",
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
"\"%s\".", resolvedType->GetString().c_str());
}
return type;
}
@@ -112,18 +129,59 @@ DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
const Type *
DeclSpecs::GetBaseType(SourcePos pos) const {
const Type *bt = baseType;
const Type *retType = baseType;
if (retType == NULL) {
Warning(pos, "No type specified in declaration. Assuming int32.");
retType = AtomicType::UniformInt32->GetAsUnboundVariabilityType();
}
if (vectorSize > 0) {
const AtomicType *atomicType = dynamic_cast<const AtomicType *>(bt);
const AtomicType *atomicType = CastType<AtomicType>(retType);
if (atomicType == NULL) {
Error(pos, "Only atomic types (int, float, ...) are legal for vector "
"types.");
return NULL;
}
bt = new VectorType(atomicType, vectorSize);
retType = new VectorType(atomicType, vectorSize);
}
return lApplyTypeQualifiers(typeQualifiers, bt, pos);
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
if (soaWidth > 0) {
const StructType *st = CastType<StructType>(retType);
if (st == NULL) {
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
"type \"%s\".", soaWidth, retType->GetString().c_str());
return NULL;
}
else if (soaWidth <= 0 || (soaWidth & (soaWidth - 1)) != 0) {
Error(pos, "soa<%d> width illegal. Value must be positive power "
"of two.", soaWidth);
return NULL;
}
if (st->IsUniformType()) {
Error(pos, "\"uniform\" qualifier and \"soa<%d>\" qualifier can't "
"both be used in a type declaration.", soaWidth);
return NULL;
}
else if (st->IsVaryingType()) {
Error(pos, "\"varying\" qualifier and \"soa<%d>\" qualifier can't "
"both be used in a type declaration.", soaWidth);
return NULL;
}
else
retType = st->GetAsSOAType(soaWidth);
if (soaWidth < g->target.vectorWidth)
PerformanceWarning(pos, "soa<%d> width smaller than gang size %d "
"currently leads to inefficient code to access "
"soa types.", soaWidth, g->target.vectorWidth);
}
return retType;
}
@@ -133,7 +191,6 @@ lGetStorageClassName(StorageClass storageClass) {
case SC_NONE: return "";
case SC_EXTERN: return "extern";
case SC_EXTERN_C: return "extern \"C\"";
case SC_EXPORT: return "export";
case SC_STATIC: return "static";
case SC_TYPEDEF: return "typedef";
default: FATAL("Unhandled storage class in lGetStorageClassName");
@@ -162,31 +219,30 @@ Declarator::Declarator(DeclaratorKind dk, SourcePos p)
: pos(p), kind(dk) {
child = NULL;
typeQualifiers = 0;
storageClass = SC_NONE;
arraySize = -1;
sym = NULL;
type = NULL;
initExpr = NULL;
}
void
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
const Type *t = GetType(ds);
Symbol *sym = GetSymbol();
if (sym != NULL) {
sym->type = t;
sym->storageClass = ds->storageClass;
const Type *baseType = ds->GetBaseType(pos);
InitFromType(baseType, ds);
if (type == NULL) {
AssertPos(pos, m->errorCount > 0);
return;
}
}
storageClass = ds->storageClass;
Symbol *
Declarator::GetSymbol() const {
// The symbol lives at the last child in the chain, so walk down there
// and return the one there.
const Declarator *d = this;
while (d->child != NULL)
d = d->child;
return d->sym;
if (ds->declSpecList.size() > 0 &&
CastType<FunctionType>(type) == NULL) {
Error(pos, "__declspec specifiers for non-function type \"%s\" are "
"not used.", type->GetString().c_str());
}
}
@@ -196,11 +252,11 @@ Declarator::Print(int indent) const {
pos.Print();
lPrintTypeQualifiers(typeQualifiers);
Symbol *sym = GetSymbol();
if (sym != NULL)
printf("%s", sym->name.c_str());
printf("%s ", lGetStorageClassName(storageClass));
if (name.size() > 0)
printf("%s", name.c_str());
else
printf("(null symbol)");
printf("(unnamed)");
printf(", array size = %d", arraySize);
@@ -234,132 +290,157 @@ Declarator::Print(int indent) const {
}
Symbol *
Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
const FunctionType *type =
dynamic_cast<const FunctionType *>(GetType(ds));
if (type == NULL)
return NULL;
Symbol *declSym = GetSymbol();
Assert(declSym != NULL);
// Get the symbol for the function from the symbol table. (It should
// already have been added to the symbol table by AddGlobal() by the
// time we get here.)
Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
if (funSym != NULL)
// May be NULL due to error earlier in compilation
funSym->pos = pos;
// Walk down to the declarator for the function. (We have to get past
// the stuff that specifies the function's return type before we get to
// the function's declarator.)
Declarator *d = this;
while (d != NULL && d->kind != DK_FUNCTION)
d = d->child;
Assert(d != NULL);
for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
Symbol *sym = d->GetSymbolForFunctionParameter(i);
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
funArgs->push_back(sym);
}
if (funSym != NULL)
funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);
return funSym;
}
const Type *
Declarator::GetType(const Type *base, DeclSpecs *ds) const {
void
Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
bool isTask = ((typeQualifiers & TYPEQUAL_TASK) != 0);
bool isExported = ((typeQualifiers & TYPEQUAL_EXPORT) != 0);
bool isConst = ((typeQualifiers & TYPEQUAL_CONST) != 0);
bool isUnmasked = ((typeQualifiers & TYPEQUAL_UNMASKED) != 0);
if (hasUniformQual && hasVaryingQual) {
Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
return NULL;
return;
}
if (kind != DK_FUNCTION && isTask)
if (kind != DK_FUNCTION && isTask) {
Error(pos, "\"task\" qualifier illegal in variable declaration.");
Type::Variability variability = Type::Unbound;
return;
}
if (kind != DK_FUNCTION && isUnmasked) {
Error(pos, "\"unmasked\" qualifier illegal in variable declaration.");
return;
}
if (kind != DK_FUNCTION && isExported) {
Error(pos, "\"export\" qualifier illegal in variable declaration.");
return;
}
Variability variability(Variability::Unbound);
if (hasUniformQual)
variability = Type::Uniform;
variability = Variability::Uniform;
else if (hasVaryingQual)
variability = Type::Varying;
variability = Variability::Varying;
const Type *type = base;
switch (kind) {
case DK_BASE:
if (kind == DK_BASE) {
// All of the type qualifiers should be in the DeclSpecs for the
// base declarator
Assert(typeQualifiers == 0);
Assert(child == NULL);
return type;
case DK_POINTER:
type = new PointerType(type, variability, isConst);
if (child != NULL)
return child->GetType(type, ds);
AssertPos(pos, typeQualifiers == 0);
AssertPos(pos, child == NULL);
type = baseType;
}
else if (kind == DK_POINTER) {
/* For now, any pointer to an SOA type gets the slice property; if
we add the capability to declare pointers as slices or not,
we'll want to set this based on a type qualifier here. */
const Type *ptrType = new PointerType(baseType, variability, isConst,
baseType->IsSOAType());
if (child != NULL) {
child->InitFromType(ptrType, ds);
type = child->type;
name = child->name;
}
else
return type;
break;
case DK_REFERENCE:
if (hasUniformQual)
type = ptrType;
}
else if (kind == DK_REFERENCE) {
if (hasUniformQual) {
Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
if (hasVaryingQual)
return;
}
if (hasVaryingQual) {
Error(pos, "\"varying\" qualifier is illegal to apply to references.");
if (isConst)
return;
}
if (isConst) {
Error(pos, "\"const\" qualifier is illegal to apply to references.");
return;
}
// The parser should disallow this already, but double check.
if (dynamic_cast<const ReferenceType *>(type) != NULL) {
if (CastType<ReferenceType>(baseType) != NULL) {
Error(pos, "References to references are illegal.");
return NULL;
return;
}
type = new ReferenceType(type);
if (child != NULL)
return child->GetType(type, ds);
const Type *refType = new ReferenceType(baseType);
if (child != NULL) {
child->InitFromType(refType, ds);
type = child->type;
name = child->name;
}
else
return type;
break;
type = refType;
}
else if (kind == DK_ARRAY) {
if (Type::Equal(baseType, AtomicType::Void)) {
Error(pos, "Arrays of \"void\" type are illegal.");
return;
}
if (CastType<ReferenceType>(baseType)) {
Error(pos, "Arrays of references (type \"%s\") are illegal.",
baseType->GetString().c_str());
return;
}
case DK_ARRAY:
type = new ArrayType(type, arraySize);
if (child)
return child->GetType(type, ds);
const Type *arrayType = new ArrayType(baseType, arraySize);
if (child != NULL) {
child->InitFromType(arrayType, ds);
type = child->type;
name = child->name;
}
else
return type;
break;
case DK_FUNCTION: {
std::vector<const Type *> args;
std::vector<std::string> argNames;
std::vector<ConstExpr *> argDefaults;
std::vector<SourcePos> argPos;
type = arrayType;
}
else if (kind == DK_FUNCTION) {
llvm::SmallVector<const Type *, 8> args;
llvm::SmallVector<std::string, 8> argNames;
llvm::SmallVector<Expr *, 8> argDefaults;
llvm::SmallVector<SourcePos, 8> argPos;
// Loop over the function arguments and store the names, types,
// default values (if any), and source file positions each one in
// the corresponding vector.
for (unsigned int i = 0; i < functionParams.size(); ++i) {
Declaration *d = functionParams[i];
Symbol *sym = GetSymbolForFunctionParameter(i);
if (d == NULL) {
AssertPos(pos, m->errorCount > 0);
continue;
}
if (d->declarators.size() == 0) {
// function declaration like foo(float), w/o a name for the
// parameter; wire up a placeholder Declarator for it
d->declarators.push_back(new Declarator(DK_BASE, pos));
d->declarators[0]->InitFromDeclSpecs(d->declSpecs);
}
AssertPos(pos, d->declarators.size() == 1);
Declarator *decl = d->declarators[0];
if (decl == NULL || decl->type == NULL) {
AssertPos(pos, m->errorCount > 0);
continue;
}
if (decl->name == "") {
// Give a name to any anonymous parameter declarations
char buf[32];
sprintf(buf, "__anon_parameter_%d", i);
decl->name = buf;
}
decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
if (d->declSpecs->storageClass != SC_NONE)
Error(sym->pos, "Storage class \"%s\" is illegal in "
Error(decl->pos, "Storage class \"%s\" is illegal in "
"function parameter declaration for parameter \"%s\".",
lGetStorageClassName(d->declSpecs->storageClass),
sym->name.c_str());
decl->name.c_str());
if (Type::Equal(decl->type, AtomicType::Void)) {
Error(decl->pos, "Parameter with type \"void\" illegal in function "
"parameter list.");
decl->type = NULL;
}
const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
const ArrayType *at = CastType<ArrayType>(decl->type);
if (at != NULL) {
// As in C, arrays are passed to functions as pointers to
// their element type. We'll just immediately make this
@@ -369,144 +450,124 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
// report this differently than it was originally declared
// in the function, but it's not clear that this is a
// significant problem.)
sym->type = PointerType::GetUniform(at->GetElementType());
const Type *targetType = at->GetElementType();
if (targetType == NULL) {
AssertPos(pos, m->errorCount > 0);
return;
}
decl->type = PointerType::GetUniform(targetType, at->IsSOAType());
// Make sure there are no unsized arrays (other than the
// first dimension) in function parameter lists.
at = dynamic_cast<const ArrayType *>(at->GetElementType());
at = CastType<ArrayType>(targetType);
while (at != NULL) {
if (at->GetElementCount() == 0)
Error(sym->pos, "Arrays with unsized dimensions in "
Error(decl->pos, "Arrays with unsized dimensions in "
"dimensions after the first one are illegal in "
"function parameter lists.");
at = dynamic_cast<const ArrayType *>(at->GetElementType());
at = CastType<ArrayType>(at->GetElementType());
}
}
args.push_back(sym->type);
argNames.push_back(sym->name);
argPos.push_back(sym->pos);
args.push_back(decl->type);
argNames.push_back(decl->name);
argPos.push_back(decl->pos);
ConstExpr *init = NULL;
if (d->declarators.size()) {
// Try to find an initializer expression; if there is one,
// it lives down to the base declarator.
Declarator *decl = d->declarators[0];
while (decl->child != NULL) {
Assert(decl->initExpr == NULL);
Expr *init = NULL;
// Try to find an initializer expression.
while (decl != NULL) {
if (decl->initExpr != NULL) {
decl->initExpr = TypeCheck(decl->initExpr);
decl->initExpr = Optimize(decl->initExpr);
if (decl->initExpr != NULL) {
init = dynamic_cast<ConstExpr *>(decl->initExpr);
if (init == NULL)
init = dynamic_cast<NullPointerExpr *>(decl->initExpr);
if (init == NULL)
Error(decl->initExpr->pos, "Default value for parameter "
"\"%s\" must be a compile-time constant.",
decl->name.c_str());
}
break;
}
else
decl = decl->child;
}
if (decl->initExpr != NULL &&
(decl->initExpr = TypeCheck(decl->initExpr)) != NULL &&
(decl->initExpr = Optimize(decl->initExpr)) != NULL &&
(init = dynamic_cast<ConstExpr *>(decl->initExpr)) == NULL) {
Error(decl->initExpr->pos, "Default value for parameter "
"\"%s\" must be a compile-time constant.",
sym->name.c_str());
}
}
argDefaults.push_back(init);
}
const Type *returnType = type;
const Type *returnType = baseType;
if (returnType == NULL) {
Error(pos, "No return type provided in function declaration.");
return NULL;
return;
}
if (CastType<FunctionType>(returnType) != NULL) {
Error(pos, "Illegal to return function type from function.");
return;
}
bool isExported = ds && (ds->storageClass == SC_EXPORT);
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
returnType = returnType->ResolveUnboundVariability(Variability::Varying);
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);
if (isExported && isTask) {
Error(pos, "Function can't have both \"task\" and \"export\" "
"qualifiers");
return NULL;
return;
}
if (isExternC && isTask) {
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
"qualifiers");
return NULL;
return;
}
if (isExternC && isExported) {
Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
"qualifiers");
return NULL;
return;
}
if (isUnmasked && isExported)
Warning(pos, "\"unmasked\" qualifier is redundant for exported "
"functions.");
if (child == NULL) {
AssertPos(pos, m->errorCount > 0);
return;
}
const Type *functionType =
const FunctionType *functionType =
new FunctionType(returnType, args, argNames, argDefaults,
argPos, isTask, isExported, isExternC);
functionType = functionType->ResolveUnboundVariability(Type::Varying);
return child->GetType(functionType, ds);
}
default:
FATAL("Unexpected decl kind");
return NULL;
}
argPos, isTask, isExported, isExternC, isUnmasked);
#if 0
// Make sure we actually have an array of structs ..
const StructType *childStructType =
dynamic_cast<const StructType *>(childType);
if (childStructType == NULL) {
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
"type \"%s\".", soaWidth, childType->GetString().c_str());
return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
// handle any explicit __declspecs on the function
if (ds != NULL) {
for (int i = 0; i < (int)ds->declSpecList.size(); ++i) {
std::string str = ds->declSpecList[i].first;
SourcePos pos = ds->declSpecList[i].second;
if (str == "safe")
(const_cast<FunctionType *>(functionType))->isSafe = true;
else if (!strncmp(str.c_str(), "cost", 4)) {
int cost = atoi(str.c_str() + 4);
if (cost < 0)
Error(pos, "Negative function cost %d is illegal.",
cost);
(const_cast<FunctionType *>(functionType))->costOverride = cost;
}
else
Error(pos, "__declspec parameter \"%s\" unknown.", str.c_str());
}
else if ((soaWidth & (soaWidth - 1)) != 0) {
Error(pos, "soa<%d> width illegal. Value must be power of two.",
soaWidth);
return NULL;
}
else if (arraySize != -1 && (arraySize % soaWidth) != 0) {
Error(pos, "soa<%d> width must evenly divide array size %d.",
soaWidth, arraySize);
return NULL;
}
return new SOAArrayType(childStructType, arraySize == -1 ? 0 : arraySize,
soaWidth);
#endif
}
const Type *
Declarator::GetType(DeclSpecs *ds) const {
const Type *baseType = ds->GetBaseType(pos);
const Type *type = GetType(baseType, ds);
return type;
}
Symbol *
Declarator::GetSymbolForFunctionParameter(int paramNum) const {
Assert(paramNum < (int)functionParams.size());
Declaration *d = functionParams[paramNum];
char buf[32];
Symbol *sym;
if (d->declarators.size() == 0) {
// function declaration like foo(float), w/o a name for
// the parameter
sprintf(buf, "__anon_parameter_%d", paramNum);
sym = new Symbol(buf, pos);
sym->type = d->declSpecs->GetBaseType(pos);
}
else {
Assert(d->declarators.size() == 1);
sym = d->declarators[0]->GetSymbol();
if (sym == NULL) {
// Handle more complex anonymous declarations like
// float (float **).
sprintf(buf, "__anon_parameter_%d", paramNum);
sym = new Symbol(buf, d->declarators[0]->pos);
sym->type = d->declarators[0]->GetType(d->declSpecs);
}
}
return sym;
}
child->InitFromType(functionType, ds);
type = child->type;
name = child->name;
}
}
///////////////////////////////////////////////////////////////////////////
// Declaration
@@ -537,18 +598,23 @@ Declaration::GetVariableDeclarations() const {
for (unsigned int i = 0; i < declarators.size(); ++i) {
Declarator *decl = declarators[i];
if (decl == NULL)
if (decl == NULL || decl->type == NULL) {
// Ignore earlier errors
Assert(m->errorCount > 0);
continue;
}
Symbol *sym = decl->GetSymbol();
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
if (Type::Equal(decl->type, AtomicType::Void))
Error(decl->pos, "\"void\" type variable illegal in declaration.");
else if (CastType<FunctionType>(decl->type) == NULL) {
decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
Symbol *sym = new Symbol(decl->name, decl->pos, decl->type,
decl->storageClass);
m->symbolTable->AddVariable(sym);
vars.push_back(VariableDeclaration(sym, decl->initExpr));
}
}
return vars;
}
@@ -559,18 +625,19 @@ Declaration::DeclareFunctions() {
for (unsigned int i = 0; i < declarators.size(); ++i) {
Declarator *decl = declarators[i];
if (decl == NULL)
if (decl == NULL || decl->type == NULL) {
// Ignore earlier errors
Assert(m->errorCount > 0);
continue;
}
Symbol *sym = decl->GetSymbol();
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
const FunctionType *ftype = CastType<FunctionType>(decl->type);
if (ftype == NULL)
continue;
bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
m->AddFunctionDeclaration(sym, isInline);
m->AddFunctionDeclaration(decl->name, ftype, decl->storageClass,
isInline, decl->pos);
}
}
@@ -584,13 +651,14 @@ Declaration::Print(int indent) const {
declarators[i]->Print(indent+4);
}
///////////////////////////////////////////////////////////////////////////
void
GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
std::vector<const Type *> *elementTypes,
std::vector<std::string> *elementNames,
std::vector<SourcePos> *elementPositions) {
llvm::SmallVector<const Type *, 8> *elementTypes,
llvm::SmallVector<std::string, 8> *elementNames,
llvm::SmallVector<SourcePos, 8> *elementPositions) {
std::set<std::string> seenNames;
for (unsigned int i = 0; i < sd.size(); ++i) {
const Type *type = sd[i]->type;
@@ -600,35 +668,41 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
// FIXME: making this fake little DeclSpecs here is really
// disgusting
DeclSpecs ds(type);
if (type->IsUniformType())
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
else if (type->IsVaryingType())
ds.typeQualifiers |= TYPEQUAL_VARYING;
if (Type::Equal(type, AtomicType::Void) == false) {
if (type->IsUniformType())
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
else if (type->IsVaryingType())
ds.typeQualifiers |= TYPEQUAL_VARYING;
else if (type->GetSOAWidth() != 0)
ds.soaWidth = type->GetSOAWidth();
// FIXME: ds.vectorSize?
}
for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
Declarator *d = (*sd[i]->declarators)[j];
d->InitFromDeclSpecs(&ds);
Symbol *sym = d->GetSymbol();
if (Type::Equal(d->type, AtomicType::Void))
Error(d->pos, "\"void\" type illegal for struct member.");
const ArrayType *arrayType =
dynamic_cast<const ArrayType *>(sym->type);
if (arrayType != NULL && arrayType->GetElementCount() == 0) {
Error(d->pos, "Unsized arrays aren't allowed in struct "
"definitions.");
elementTypes->push_back(NULL);
}
else
elementTypes->push_back(sym->type);
elementTypes->push_back(d->type);
if (seenNames.find(sym->name) != seenNames.end())
if (seenNames.find(d->name) != seenNames.end())
Error(d->pos, "Struct member \"%s\" has same name as a "
"previously-declared member.", sym->name.c_str());
"previously-declared member.", d->name.c_str());
else
seenNames.insert(sym->name);
seenNames.insert(d->name);
elementNames->push_back(sym->name);
elementPositions->push_back(sym->pos);
elementNames->push_back(d->name);
elementPositions->push_back(d->pos);
}
}
for (int i = 0; i < (int)elementTypes->size() - 1; ++i) {
const ArrayType *arrayType = CastType<ArrayType>((*elementTypes)[i]);
if (arrayType != NULL && arrayType->GetElementCount() == 0)
Error((*elementPositions)[i], "Unsized arrays aren't allowed except "
"for the last member in a struct definition.");
}
}

60
decl.h

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -47,30 +47,21 @@
variables--here, that the declaration has the 'static' and 'uniform'
qualifiers, and that its basic type is 'int'. Then for each variable
declaration, the Declaration class holds an instance of a Declarator,
which in turn records the per-variable information like the symbol
name, array size (if any), initializer expression, etc.
which in turn records the per-variable information like the name, array
size (if any), initializer expression, etc.
*/
#ifndef ISPC_DECL_H
#define ISPC_DECL_H
#include "ispc.h"
#include <llvm/ADT/SmallVector.h>
struct VariableDeclaration;
class Declaration;
class Declarator;
enum StorageClass {
SC_NONE,
SC_EXTERN,
SC_EXPORT,
SC_STATIC,
SC_TYPEDEF,
SC_EXTERN_C
};
/* Multiple qualifiers can be provided with types in declarations;
therefore, they are set up so that they can be ANDed together into an
int. */
@@ -82,6 +73,8 @@ enum StorageClass {
#define TYPEQUAL_SIGNED (1<<4)
#define TYPEQUAL_UNSIGNED (1<<5)
#define TYPEQUAL_INLINE (1<<6)
#define TYPEQUAL_EXPORT (1<<7)
#define TYPEQUAL_UNMASKED (1<<8)
/** @brief Representation of the declaration specifiers in a declaration.
@@ -90,7 +83,8 @@ enum StorageClass {
*/
class DeclSpecs {
public:
DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE, int tq = TYPEQUAL_NONE);
DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE,
int tq = TYPEQUAL_NONE);
void Print() const;
@@ -117,6 +111,8 @@ public:
SOA width specified. Otherwise this is zero.
*/
int soaWidth;
std::vector<std::pair<std::string, SourcePos> > declSpecList;
};
@@ -138,25 +134,11 @@ public:
Declarator(DeclaratorKind dk, SourcePos p);
/** Once a DeclSpecs instance is available, this method completes the
initialization of the Symbol, setting its Type accordingly.
initialization of the type member.
*/
void InitFromDeclSpecs(DeclSpecs *ds);
/** Get the actual type of the combination of Declarator and the given
DeclSpecs. If an explicit base type is provided, the declarator is
applied to that type; otherwise the base type from the DeclSpecs is
used. */
const Type *GetType(DeclSpecs *ds) const;
const Type *GetType(const Type *base, DeclSpecs *ds) const;
/** Returns the symbol corresponding to the function declared by this
declarator and symbols for its arguments in *args. */
Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
Symbol *GetSymbolForFunctionParameter(int paramNum) const;
/** Returns the symbol associated with the declarator. */
Symbol *GetSymbol() const;
void InitFromType(const Type *base, DeclSpecs *ds);
void Print(int indent) const;
@@ -177,18 +159,24 @@ public:
/** Type qualifiers provided with the declarator. */
int typeQualifiers;
StorageClass storageClass;
/** For array declarators, this gives the declared size of the array.
Unsized arrays have arraySize == 0. */
int arraySize;
/** Symbol associated with the declarator. */
Symbol *sym;
/** Name associated with the declarator. */
std::string name;
/** Initialization expression for the variable. May be NULL. */
Expr *initExpr;
/** Type of the declarator. This is NULL until InitFromDeclSpecs() or
InitFromType() is called. */
const Type *type;
/** For function declarations, this holds the Declaration *s for the
funciton's parameters. */
function's parameters. */
std::vector<Declaration *> functionParams;
};
@@ -233,8 +221,8 @@ struct StructDeclaration {
/** Given a set of StructDeclaration instances, this returns the types of
the elements of the corresponding struct and their names. */
extern void GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
std::vector<const Type *> *elementTypes,
std::vector<std::string> *elementNames,
std::vector<SourcePos> *elementPositions);
llvm::SmallVector<const Type *, 8> *elementTypes,
llvm::SmallVector<std::string, 8> *elementNames,
llvm::SmallVector<SourcePos, 8> *elementPositions);
#endif // ISPC_DECL_H


@@ -1,3 +1,199 @@
=== v1.3.0 === (29 June 2012)
This is a major new release of ispc, with support for more compilation
targets and a number of additions to the language. As usual, the quality
of generated code has also been improved in a number of cases and a number
of small bugs have been fixed.
New targets:
* This release provides "beta" support for compiling to Intel® Xeon
Phi™ processor, code named Knights Corner, the first processor in
the Intel® Many Integrated Core Architecture. See
http://ispc.github.com/ispc.html#compiling-for-the-intel-xeon-phi-architecture
for more details on this support.
* This release also has an "avx1.1" target, which provides support for the
new instructions in the Intel Ivy Bridge microarchitecture.
New language features (a combined sketch follows this list):
* The foreach_active statement allows iteration over the active program
instances in a gang. (See
http://ispc.github.com/ispc.html#iteration-over-active-program-instances-foreach-active)
* foreach_unique allows iterating over subsets of program instances in a
gang that share the same value of a variable. (See
http://ispc.github.com/ispc.html#iteration-over-unique-elements-foreach-unique)
* An "unmasked" function qualifier and statement in the language allow
re-activating execution of all program instances in a gang. (See
http://ispc.github.com/ispc.html#re-establishing-the-execution-mask)
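As a rough illustration (the function and variable names here are invented
for this sketch, not taken from the release), the three constructs can be
used together along these lines:

    void tally(uniform int counts[], int bin, uniform int * uniform nActive) {
        foreach_active (i) {
            // The body runs once per active program instance, serially.
            ++(*nActive);
        }
        foreach_unique (b in bin) {
            // The body runs once per distinct value of 'bin' among the
            // active program instances; 'b' is uniform inside the body.
            ++counts[b];
        }
        unmasked {
            // All program instances are active for the statements here.
        }
    }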
Standard library updates:
* The seed_rng() function has been modified to take a "varying" seed value
when a varying RNGState is being initialized.
* An isnan() function has been added, to check for floating-point "not a
number" values.
* The float_to_srgb8() routine does high performance conversion of
floating-point color values to SRGB8 format.
Other changes:
* A number of bugfixes have been made for compiler crashes with malformed
programs.
* Floating-point comparisons are now "unordered", so that any comparison
where one of the operands is a "not a number" value returns false. (This
matches standard IEEE floating-point behavior.)
* The code generated for 'break' statements in "varying" loops has been
improved for some common cases.
* Compile time and compiler memory use have both been improved,
particularly for large input programs.
* A number of bugs have been fixed in the debugging information generated
by the compiler when the "-g" command-line flag is used.
=== v1.2.2 === (20 April 2012)
This release includes a number of small additions to functionality and a
number of bugfixes. New functionality includes:
* It's now possible to forward declare structures as in C/C++: "struct
Foo;". After such a declaration, structs with pointers to "Foo" and
functions that take pointers or references to Foo structs can be declared
without the entire definition of Foo being available (a short sketch follows this list).
* New built-in types size_t, ptrdiff_t, and [u]intptr_t are now available,
corresponding to the equivalent types in C.
* The standard library now provides atomic_swap*() and
atomic_compare_exchange*() functions for void * types.
* The C++ backend has seen a number of improvements to the quality and
readability of generated code.
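For instance (the names here are illustrative), the following sequence is
now accepted:

    struct Foo;                             // forward declaration
    void consume(uniform Foo * uniform p);  // only a pointer to Foo is needed here
    struct Foo { float value; };            // the full definition can come later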
A number of bugs have been fixed in this release as well. The most
significant are:
* Fixed a bug where nested loops could cause a compiler crash in some
circumstances (issues #240, and #229)
* Gathers could access invalid memory (and cause the program to crash) in
some circumstances (#235)
* References to temporary values are now handled properly when passed to a
function that takes a reference typed parameter.
* A case where incorrect code could be generated for compile-time-constant
initializers has been fixed (#234).
=== v1.2.1 === (6 April 2012)
This release contains only minor new functionality; it mostly consists of many
small bugfixes and improvements to error handling and error reporting.
The new functionality that is present is:
* Significantly more efficient versions of the float / half conversion
routines are now available in the standard library, thanks to Fabian
Giesen.
* The last member of a struct can now be a zero-length array; this allows
the trick of dynamically allocating enough storage for the struct and
some number of array elements at the end of it.
Significant bugs fixed include:
* Issue #205: When a target ISA isn't specified, use the host system's
capabilities to choose a target for which it will be able to run the
generated code.
* Issues #215 and #217: Don't allocate storage for global variables that
are declared "extern".
* Issue #197: Allow NULL as a default argument value in a function
declaration.
* Issue #223: Fix bugs where taking the address of a function wouldn't work
as expected.
* Issue #224: When there are overloaded variants of a function that take
both reference and const reference parameters, give the non-const
reference preference when matching values of that underlying type.
* Issue #225: An error is issued when a varying lvalue is assigned to a
reference type (rather than crashing).
* Issue #193: Permit conversions from array types to void *, not just the
pointer type of the underlying array element.
* Issue #199: Still evaluate expressions that are cast to (void).
The documentation has also been improved, with FAQs added to clarify some
aspects of the ispc pointer model.
=== v1.2.0 === (20 March 2012)
This is a major new release of ispc, with a number of significant
improvements to functionality, performance, and compiler robustness. It
does, however, include three small changes to language syntax and semantics
that may require changes to existing programs:
* Syntax for the "launch" keyword has been cleaned up; it's now no longer
necessary to bracket the launched function call with angle brackets.
(In other words, now use "launch foo();", rather than "launch < foo() >;".)
* When using pointers, the pointed-to data type is now "uniform" by
default. Use the varying keyword to specify varying pointed-to types when
needed. (i.e. "float *ptr" is a varying pointer to uniform float data,
whereas previously it was a varying pointer to varying float values.)
Use "varying float *" to specify a varying pointer to varying float data,
and so forth.
* The details of "uniform" and "varying" and how they interact with struct
types have been cleaned up. Now, when a struct type is declared, if the
struct elements don't have explicit "uniform" or "varying" qualifiers,
they are said to have "unbound" variability. When a struct type is
instantiated, any unbound variability elements inherit the variability of
the parent struct type. See http://ispc.github.com/ispc.html#struct-types
for more details.
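For example, here is a small sketch of how unbound variability resolves under
these rules (the type and member names are illustrative):

    struct Params {
        float scale;        // no qualifier: unbound variability
        uniform int count;  // explicitly uniform
    };
    uniform Params up;   // up.scale becomes uniform; up.count is uniform
    varying Params vp;   // vp.scale becomes varying; vp.count stays uniform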
ispc has a new language feature that makes it much easier to use the
efficient "(array of) structure of arrays" (AoSoA, or SoA) memory layout of
data. A new "soa<n>" qualifier can be applied to structure types to
specify an n-wide SoA version of the corresponding type. Array indexing
and pointer operations with arrays of SoA types automatically handle the
two-stage indexing calculation to access the data. See
http://ispc.github.com/ispc.html#structure-of-array-types for more details.
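For example, a brief sketch of the qualifier in use (the declarations are
illustrative):

    struct Point { float x, y, z; };
    soa<4> Point pts[64];            // stored as groups of 4 x's, 4 y's, 4 z's
    float v = pts[programIndex].x;   // two-level indexing generated automatically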
For more efficient access of data that is still in "array of structures"
(AoS) format, ispc has a new "memory coalescing" optimization that
automatically detects series of strided loads and/or gathers that can be
transformed into a more efficient set of vector loads and shuffles. A
diagnostic is emitted when this optimization is successfully applied.
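Code of roughly the following shape is the kind of access pattern the
coalescing optimization targets (a sketch only, not a guarantee that the
transformation fires in every such case):

    struct Color { float r, g, b; };
    void loadPixel(uniform Color img[]) {
        // Adjacent strided loads from an "array of structures" layout...
        float r = img[programIndex].r;
        float g = img[programIndex].g;
        float b = img[programIndex].b;
        // ...may be replaced by vector loads plus shuffles, with a
        // diagnostic reported when the optimization is applied.
    }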
Smaller changes in this release:
* The standard library now provides memcpy(), memmove() and memset()
functions, as well as single-precision asin() and acos() functions.
* -I can now be specified on the command-line to specify a search path for
#include files.
* A number of improvements have been made to error reporting from the
parser, and a number of cases where malformed programs could cause the
compiler to crash have been fixed.
* A number of small improvements to the quality and performance of generated
code have been made, including finding more cases where 32-bit addressing
calculations can be safely done on 64-bit systems and generating better
code for initializer expressions.
=== v1.1.4 === (4 February 2012)
There are two major bugfixes for Windows in this release. First, a number


@@ -1,11 +1,14 @@
#!/bin/bash
for i in ispc perfguide faq; do
rst2html.py --template=template.txt --link-stylesheet \
rst2html --template=template.txt --link-stylesheet \
--stylesheet-path=css/style.css $i.rst > $i.html
done
rst2html.py --template=template-perf.txt --link-stylesheet \
rst2html --template=template-news.txt --link-stylesheet \
--stylesheet-path=css/style.css news.rst > news.html
rst2html --template=template-perf.txt --link-stylesheet \
--stylesheet-path=css/style.css perf.rst > perf.html
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex


@@ -14,11 +14,24 @@ distribution.
+ `Why are there multiple versions of exported ispc functions in the assembly output?`_
+ `How can I more easily see gathers and scatters in generated assembly?`_
* Running The Compiler
+ `Why is it required to use one of the "generic" targets with C++ output?`_
+ `Why won't the compiler generate an object file or assembly output with the "generic" targets?`_
* Language Details
+ `What is the difference between "int *foo" and "int foo[]"?`_
+ `Why are pointed-to types "uniform" by default?`_
+ `What am I getting an error about assigning a varying lvalue to a reference type?`_
* Interoperability
+ `How can I supply an initial execution mask in the call from the application?`_
+ `How can I generate a single binary executable with support for multiple instruction sets?`_
+ `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
+ `Is it possible to inline ispc functions in C/C++ code?`_
+ `Why is it illegal to pass "varying" values from C/C++ to ispc functions?`_
* Programming Techniques
@@ -26,6 +39,8 @@ distribution.
+ `How can a gang of program instances generate variable amounts of output efficiently?`_
+ `Is it possible to use ispc for explicit vector programming?`_
+ `How can I debug my ispc programs using Valgrind?`_
+ `foreach statements generate more complex assembly than I'd expect; what's going on?`_
+ `How do I launch an individual task for each active program instance?`_
Understanding ispc's Output
===========================
@@ -212,6 +227,174 @@ easier to understand:
jmp ___pseudo_scatter_base_offsets32_32 ## TAILCALL
Running The Compiler
====================
Why is it required to use one of the "generic" targets with C++ output?
-----------------------------------------------------------------------
The C++ output option transforms the provided ``ispc`` program source into
C++ code where each basic operation in the program (addition, comparison,
etc.) is represented as a function call to an as-yet-undefined function,
chaining the results of these calls together to perform the required
computations. It is then expected that the user will provide the
implementation of these functions via a header file with ``inline``
functions defined for each of these functions and then use a C++ compiler
to generate a final object file. (Examples of these headers include
``examples/intrinsics/sse4.h`` and ``examples/intrinsics/knc.h`` in the
``ispc`` distribution.)
If a target other than one of the "generic" ones is used with C++ output,
then the compiler will transform certain operations into particular code
sequences that may not be desired for the actual final target; for example,
SSE targets that don't have hardware "gather" instructions will transform a
gather into a sequence of scalar load instructions. When this in turn is
transformed to C++ code, the fact that the loads were originally a gather
is lost, and the header file of function definitions wouldn't have a chance
to map the "gather" to a target-specific operation, as the ``knc.h`` header
does, for example. Thus, the "generic" targets exist to provide basic
targets of various vector widths, without imposing any limitations on the
final target's capabilities.
Why won't the compiler generate an object file or assembly output with the "generic" targets?
---------------------------------------------------------------------------------------------
As described in the above FAQ entry, when compiling to the "generic"
targets, ``ispc`` generates vector code for the source program that
transforms every basic operation in the program (addition, comparison,
etc.) into a separate function call.
While there is no fundamental reason that the compiler couldn't generate
target-specific object code with a function call to an undefined function
for each primitive operation, doing so wouldn't actually be useful in
practice--providing definitions of these functions in a separate object
file and actually performing function calls for each of them (versus
turning them into inline function calls) would be a highly inefficient way
to run the program.
Therefore, in the interests of encouraging the use of the system,
these types of output are disallowed.
Language Details
================
What is the difference between "int \*foo" and "int foo[]"?
-----------------------------------------------------------
In C and C++, declaring a function to take a parameter ``int *foo`` and
``int foo[]`` results in the same type for the parameter. Both are
pointers to integers. In ``ispc``, these are different types. The first
one is a varying pointer to a uniform integer value in memory, while the
second results in a uniform pointer to the start of an array of varying
integer values in memory.
To understand why the first is a varying pointer to a uniform integer,
first recall that types without explicit rate qualifiers (``uniform``,
``varying``, or ``soa<>``) are ``varying`` by default. Second, recall from
the `discussion of pointer types in the ispc User's Guide`_ that pointed-to
types without rate qualifiers are ``uniform`` by default. (This second
rule is discussed further below, in `Why are pointed-to types "uniform" by
default?`_.) The type of ``int *foo`` follows from these.
.. _discussion of pointer types in the ispc User's Guide: ispc.html#pointer-types
Conversely, in a function body, ``int foo[10]`` represents a declaration of
a 10-element array of varying ``int`` values. In that we'd certainly like
to be able to pass such an array to a function that takes an ``int []``
parameter, the natural type for an ``int []`` parameter is a uniform
pointer to varying integer values.
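As a small sketch of the resulting difference (the function and variable
names here are illustrative)::

    void f(int *p, int a[]) {
        // p: varying pointer to uniform int -- each program instance may
        // hold a different address, so this load is in general a gather.
        int x = *p;
        // a: uniform pointer to varying int -- the gang shares one base
        // address, and a[0] is a full gang-wide (varying) value that can
        // be read with a vector load.
        int y = a[0];
    }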
In terms of compatibility with C/C++, it's unfortunate that this
distinction exists, though any other set of rules seems to introduce more
awkwardness than this one. (Though we're interested to hear ideas to
improve these rules!).
Why are pointed-to types "uniform" by default?
----------------------------------------------
In ``ispc``, types without rate qualifiers are "varying" by default, but
types pointed to by pointers without rate qualifiers are "uniform" by
default. Why this difference?
::
int foo; // no rate qualifier, "varying int".
uniform int *foo; // pointer type has no rate qualifier, pointed-to does.
// "varying pointer to uniform int".
int *foo; // neither pointer type nor pointed-to type ("int") have
// rate qualifiers. Pointer type is varying by default,
// pointed-to is uniform. "varying pointer to uniform int".
varying int *foo; // varying pointer to varying int
The first rule, having types without rate qualifiers be varying by default,
is a default that keeps the number of "uniform" or "varying" qualifiers in
``ispc`` programs low. Most ``ispc`` programs use mostly "varying"
variables, so this rule allows most variables to be declared without also
requiring rate qualifiers.
On a related note, this rule allows many C/C++ functions to be used to
define equivalent functions in the SPMD execution model that ``ispc``
provides with little or no modification:
::
// scalar add in C/C++, SPMD/vector add in ispc
int add(int a, int b) { return a + b; }
This motivation also explains why ``uniform int *foo`` represents a varying
pointer; having pointers be varying by default if they don't have rate
qualifiers similarly helps with porting code from C/C++ to ``ispc``.
The trickier issue is why pointed-to types are "uniform" by default. In our
experience, data in memory that is accessed via pointers is most often
uniform; this generally includes all data that has been allocated and
initialized by the C/C++ application code. In practice, "varying" types are
more generally (but not exclusively) used for local data in ``ispc``
functions. Thus, making the pointed-to type uniform by default leads to
more concise code for the most common cases.
What am I getting an error about assigning a varying lvalue to a reference type?
--------------------------------------------------------------------------------
Given code like the following:
::
uniform float a[...];
int index = ...;
float &r = a[index];
``ispc`` issues the error "Initializer for reference-type variable "r" must
have a uniform lvalue type.". The underlying issue stems from how
references are represented in the code generated by ``ispc``. Recall that
``ispc`` supports both uniform and varying pointer types--a uniform pointer
points to the same location in memory for all program instances in the
gang, while a varying pointer allows each program instance to have its own
pointer value.
References are represented as a pointer in the code generated by ``ispc``,
though this is generally opaque to the user; in ``ispc``, they are
specifically uniform pointers. This design decision was made so that given
code like this:
::
extern void func(float &val);
float foo = ...;
func(foo);
Then the reference would be handled efficiently as a single pointer, rather
than unnecessarily being turned into a gang's worth of pointers.
However, an implication of this decision is that it's not possible for
references to refer to completely different things for each of the program
instances. (And hence the error that is issued). In cases where a unique
per-program-instance pointer is needed, a varying pointer should be used
instead of a reference.
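A minimal sketch of that workaround, following the example above::

    uniform float a[...];
    int index = ...;
    // Instead of "float &r = a[index];" (which triggers the error), take a
    // varying pointer, so each program instance refers to its own element:
    float *r = &a[index];
    *r += 1;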
Interoperability
================
@@ -346,6 +529,92 @@ In a similar fashion, it's possible to find out at run-time the value of
export uniform int width() { return programCount; }
Is it possible to inline ispc functions in C/C++ code?
------------------------------------------------------
If you're willing to use the ``clang`` C/C++ compiler that's part of the
LLVM tool suite, then it is possible to inline ``ispc`` code with C/C++
(and conversely, to inline C/C++ calls in ``ispc``). Doing so can provide
performance advantages when calling out to short functions written in the
"other" language. Note that you don't need to use ``clang`` to compile all
of your C/C++ code, but only for the files where you want to be able to
inline. In order to do this, you must have a full installation of LLVM
version 3.0 or later, including the ``clang`` compiler.
The basic approach is to have the various compilers emit LLVM intermediate
representation (IR) code and to then use tools from LLVM to link together
the IR from the compilers and then re-optimize it, which gives the LLVM
optimizer the opportunity to do additional inlining and cross-function
optimizations. If you have source files ``foo.ispc`` and ``foo.cpp``,
first emit LLVM IR:
::
ispc --emit-llvm -o foo_ispc.bc foo.ispc
clang -O2 -c -emit-llvm -o foo_cpp.bc foo.cpp
Next, link the two IR files into a single file and run the LLVM optimizer
on the result:
::
llvm-link foo_ispc.bc foo_cpp.bc -o - | opt -O3 -o foo_opt.bc
And finally, generate a native object file:
::
llc -filetype=obj foo_opt.bc -o foo.o
This file can in turn be linked in with the rest of your object files when
linking your application.
(Note that if you're using the AVX instruction set, you must provide the
``-mattr=+avx`` flag to ``llc``.)
Why is it illegal to pass "varying" values from C/C++ to ispc functions?
------------------------------------------------------------------------
If any of the types in the parameter list to an exported function is
"varying" (including recursively, and members of structure types, etc.),
then ``ispc`` will issue an error and refuse to compile the function:
::
% echo "export int add(int x) { return ++x; }" | ispc
<stdin>:1:12: Error: Illegal to return a "varying" type from exported function "foo"
<stdin>:1:20: Error: Varying parameter "x" is illegal in an exported function.
While there's no fundamental reason why this isn't possible, recall the
definition of "varying" variables: they have one value for each program
instance in the gang. As such, the number of values and amount of storage
required to represent a varying variable depends on the gang size
(i.e. ``programCount``), which can have different values depending on the
compilation target.
``ispc`` therefore prohibits passing "varying" values between the
application and the ``ispc`` program; this prevents the application-side
code from depending on a particular gang size and encourages portability
to different gang sizes. (A generally desirable programming practice.)
For cases where the size of data is actually fixed from the application
side, the value can be passed via a pointer to a short ``uniform`` array,
as follows:
::
export void add4(uniform int ptr[4]) {
foreach (i = 0 ... 4)
ptr[i]++;
}
On the 4-wide SSE instruction set, this compiles to a single vector add
instruction (and associated move instructions), while it still also
efficiently computes the correct result on 8-wide AVX targets.
Programming Techniques
======================
@@ -480,3 +749,131 @@ you can use ``--target=sse4`` when compiling to run with ``valgrind``.
Note that ``valgrind`` does not yet support programs that use the AVX
instruction set.
foreach statements generate more complex assembly than I'd expect; what's going on?
-----------------------------------------------------------------------------------
Given a simple ``foreach`` loop like the following:
::
void foo(uniform float a[], uniform int count) {
foreach (i = 0 ... count)
a[i] *= 2;
}
the ``ispc`` compiler generates approximately 40 instructions--why isn't
the generated code simpler?
There are two main components to the code: one handles
``programCount``-sized chunks of elements of the array, and the other
handles any excess elements at the end of the array that don't completely
fill a gang. The code for the main loop is essentially what one would
expect: a vector of values is loaded from the array, the multiply is done,
and the result is stored.
::
LBB0_2: ## %foreach_full_body
movslq %edx, %rdx
vmovups (%rdi,%rdx), %ymm1
vmulps %ymm0, %ymm1, %ymm1
vmovups %ymm1, (%rdi,%rdx)
addl $32, %edx
addl $8, %eax
cmpl %ecx, %eax
jl LBB0_2
Then, there is a sequence of instructions that handles any additional
elements at the end of the array. (These instructions don't execute if
there aren't any left-over values to process, but they do lengthen the
amount of generated code.)
::
## BB#4: ## %partial_inner_only
vmovd %eax, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vpermilps $0, %ymm0, %ymm0 ## ymm0 = ymm0[0,0,0,0,4,4,4,4]
vextractf128 $1, %ymm0, %xmm3
vmovd %esi, %xmm2
vmovaps LCPI0_1(%rip), %ymm1
vextractf128 $1, %ymm1, %xmm4
vpaddd %xmm4, %xmm3, %xmm3
# ....
vmulps LCPI0_0(%rip), %ymm1, %ymm1
vmaskmovps %ymm1, %ymm0, (%rdi,%rax)
If you know that the number of elements to be processed will always be an
exact multiple of 8, 16, etc., then adding a simple assignment to
``count`` like the one below gives the compiler enough information to be
able to eliminate the code for the additional array elements.
::
void foo(uniform float a[], uniform int count) {
// This assignment doesn't change the value of count
// if it's a multiple of 16, but it gives the compiler
// insight into this fact, allowing for simpler code to
// be generated for the foreach loop.
count = (count & ~(16-1));
foreach (i = 0 ... count)
a[i] *= 2;
}
With this new version of ``foo()``, only the code for the first loop above
is generated.
How do I launch an individual task for each active program instance?
--------------------------------------------------------------------
Recall from the `discussion of "launch" in the ispc User's Guide`_ that a
``launch`` statement launches a single task corresponding to a single gang
of executing program instances, where the indices of the active program
instances are the same as were active when the ``launch`` statement
executed.
.. _discussion of "launch" in the ispc User's Guide: ispc.html#task-parallelism-launch-and-sync-statements
In some situations, it's desirable to be able to launch an individual task
for each executing program instance. For example, we might be performing
an iterative computation where a subset of the program instances determine
that an item they are responsible for requires additional processing.
::
bool itemNeedsMoreProcessing(int);
int itemNum = ...;
if (itemNeedsMoreProcessing(itemNum)) {
// do additional work
}
For performance reasons, it may be desirable to apply an entire gang's
worth of computation to each item that needs additional processing;
there may be available parallelism in this computation such that we'd like
to process each of the items with SPMD computation.
In this case, the ``foreach_active`` and ``unmasked`` constructs can be
applied together to accomplish this goal.
::
// do additional work
task void doWork(uniform int index);
foreach_active (index) {
unmasked {
launch doWork(extract(itemNum, index));
}
}
Recall that the body of the ``foreach_active`` loop runs once for each
active program instance, with each active program instance's
``programIndex`` value available in ``index`` in the above. In the loop,
we can re-establish an "all on" execution mask, enabling execution in all
of the program instances in the gang, such that execution in ``doWork()``
starts with all instances running. (Alternatively, the ``unmasked`` block
could be in the definition of ``doWork()``.)

docs/news.rst Normal file

@@ -0,0 +1,71 @@
=========
ispc News
=========
ispc 1.3.0 is Released
----------------------
A major new version of ``ispc`` has been released. In addition to a number
of new language features, this release notably features initial support for
compiling to the Intel Xeon Phi (Many Integrated Core) architecture.
ispc 1.2.1 is Released
----------------------
This is a bugfix release, fixing approximately 20 bugs in the system and
improving error handling and error reporting. New functionality includes
very efficient float/half conversion routines thanks to Fabian
Giesen. See the `1.2.1 release notes`_ for details.
.. _1.2.1 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
ispc 1.2.0 is Released
-----------------------
A new major release was posted on March 20, 2012. This release includes
significant new functionality for cleanly handling "structure of arrays"
(SoA) data layout and a new model for how uniform and varying are handled
with structure types.
Paper on ispc To Appear in InPar 2012
-------------------------------------
A technical paper on ``ispc``, `ispc: A SPMD Compiler for High-Performance
CPU Programming`_, by Matt Pharr and William R. Mark, has been accepted to
the `InPar 2012`_ conference. This paper describes a number of the design
features and key characteristics of the ``ispc`` implementation.
(© 2012 IEEE. Personal use of this material is permitted. Permission from
IEEE must be obtained for all other uses, in any current or future media,
including reprinting/republishing this material for advertising or
promotional purposes, creating new collective works, for resale or
redistribution to servers or lists, or reuse of any copyrighted component
of this work in other works.).
.. _ispc\: A SPMD Compiler for High-Performance CPU Programming: https://github.com/downloads/ispc/ispc/ispc_inpar_2012.pdf
.. _InPar 2012: http://innovativeparallel.org/
ispc 1.1.4 is Released
----------------------
On February 4, 2012, the 1.1.4 release of ``ispc`` was posted; new features
include ``new`` and ``delete`` for dynamic memory allocation in ``ispc``
programs, "local" atomic operations in the standard library, and a new
scalar compilation target. See the `1.1.4 release notes`_ for details.
.. _1.1.4 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
ispc 1.1.3 is Released
----------------------
With this release, the language now supports "switch" statements, with the same semantics and syntax as in C.
This release includes fixes for two important performance related issues:
the quality of code generated for "foreach" statements has been
substantially improved, and a performance regression with code for "gathers"
that was introduced in v1.1.2 has been fixed in this release.
Thanks to Jean-Luc Duprat for a number of patches that improve support for
building on various platforms, and to Pierre-Antoine Lacaze for patches so
that ispc builds under MinGW.


@@ -13,6 +13,7 @@ the most out of ``ispc`` in practice.
+ `Improving Control Flow Coherence With "foreach_tiled"`_
+ `Using Coherent Control Flow Constructs`_
+ `Use "uniform" Whenever Appropriate`_
+ `Use "Structure of Arrays" Layout When Possible`_
* `Tips and Techniques`_
@@ -20,6 +21,7 @@ the most out of ``ispc`` in practice.
+ `Avoid 64-bit Addressing Calculations When Possible`_
+ `Avoid Computation With 8 and 16-bit Integer Types`_
+ `Implementing Reductions Efficiently`_
+ `Using "foreach_active" Effectively`_
+ `Using Low-level Vector Tricks`_
+ `The "Fast math" Option`_
+ `"inline" Aggressively`_
@@ -247,6 +249,76 @@ but it's always best to provide the compiler with as much help as possible
to understand the actual form of your computation.
Use "Structure of Arrays" Layout When Possible
----------------------------------------------
In general, memory access performance (for both reads and writes) is best
when the running program instances access a contiguous region of memory; in
this case efficient vector load and store instructions can often be used
rather than gathers and scatters. As an example of this issue, consider an
array of a simple point datatype laid out and accessed in conventional
"array of structures" (AOS) layout:
::
struct Point { float x, y, z; };
uniform Point pts[...];
float v = pts[programIndex].x;
In the above code, the access to ``pts[programIndex].x`` accesses
non-sequential memory locations, due to the ``y`` and ``z`` values between
the desired ``x`` values in memory. A "gather" is required to get the
value of ``v``, with a corresponding decrease in performance.
If ``Point`` was defined as a "structure of arrays" (SOA) type, the access
can be much more efficient:
::
struct Point8 { float x[8], y[8], z[8]; };
uniform Point8 pts8[...];
int majorIndex = programIndex / 8;
int minorIndex = programIndex % 8;
float v = pts8[majorIndex].x[minorIndex];
In this case, each ``Point8`` has 8 ``x`` values contiguous in memory
before 8 ``y`` values and then 8 ``z`` values. If the gang size is 8 or
less, the access for ``v`` will have the same value of ``majorIndex`` for
all program instances and will access consecutive elements of the ``x[8]``
array with a vector load. (For larger gang sizes, two 8-wide vector loads
would be issued, which is also quite efficient.)
However, the syntax in the above code is messy; accessing SOA data in this
fashion is much less elegant than the corresponding code for accessing the
data with AOS layout. The ``soa`` qualifier in ``ispc`` can be used to
cause the corresponding transformation to be made to the ``Point`` type,
while preserving the clean syntax for data access that comes with AOS
layout:
::
soa<8> Point pts[...];
float v = pts[programIndex].x;
Thanks to having SOA layout as a first-class concept in the language's type
system, it's easy to write functions that convert data between the
layouts. For example, the ``aos_to_soa`` function below converts ``count``
elements of the given ``Point`` type from AOS to 8-wide SOA layout. (It
assumes that the caller has pre-allocated sufficient space in the
``pts_soa`` output array.)
::
void aos_to_soa(uniform Point pts_aos[], uniform int count,
soa<8> pts_soa[]) {
foreach (i = 0 ... count)
pts_soa[i] = pts_aos[i];
}
Analogously, a function could be written to convert back from SOA to AOS if
needed.
Tips and Techniques
===================
@@ -339,6 +411,12 @@ based on the index, it can be worth doing. See the example
``examples/volume_rendering`` in the ``ispc`` distribution for the use of
this technique in an instance where it is beneficial to performance.
Understanding Memory Read Coalescing
------------------------------------
XXXX todo
Avoid 64-bit Addressing Calculations When Possible
--------------------------------------------------
@@ -433,6 +511,43 @@ values--very efficient code in the end.
return reduce_add(sum);
}
Using "foreach_active" Effectively
----------------------------------
For high-performance code, it can be worth giving the compiler extra help inside ``foreach_active`` loops.
For example, consider this segment of code, from the introduction of
``foreach_active`` in the ispc User's Guide:
::
uniform float array[...] = { ... };
int index = ...;
foreach_active (i) {
++array[index];
}
Here, ``index`` was assumed to possibly have the same value for multiple
program instances, so the updates to ``array[index]`` are serialized by the
``foreach_active`` statement in order to not have undefined results when
``index`` values do collide.
The code generated by the compiler can be improved in this case by making
it clear that only a single element of the array is accessed by
``array[index]``, and thus that a general gather or scatter isn't required.
Specifically, by using the ``extract()`` function from the standard library
to extract the current program instance's value of ``index`` into a
``uniform`` variable and then using that to index into ``array``, as below,
more efficient code is generated.
::
foreach_active (instanceNum) {
uniform int unifIndex = extract(index, instanceNum);
++array[unifIndex];
}
Using Low-level Vector Tricks
-----------------------------
@@ -547,7 +662,7 @@ gathers happen.)
extern "C" {
void ISPCInstrument(const char *fn, const char *note,
int line, int mask);
int line, uint64_t mask);
}
This function is passed the file name of the ``ispc`` file running, a short
@@ -560,7 +675,7 @@ as follows:
::
ISPCInstrument("foo.ispc", "function entry", 55, 0xf);
ISPCInstrument("foo.ispc", "function entry", 55, 0xfull);
This call indicates that the currently executing program has just
entered the function defined at line 55 of the file ``foo.ispc``, with a

docs/template-news.txt Normal file

@@ -0,0 +1,66 @@
%(head_prefix)s
%(head)s
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-1486404-4']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
%(stylesheet)s
%(body_prefix)s
<div id="wrap">
<div id="wrap2">
<div id="header">
<h1 id="logo">Intel SPMD Program Compiler</h1>
<div id="slogan">An open-source compiler for high-performance SIMD programming on
the CPU</div>
</div>
<div id="nav">
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li id="selected"><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li><a href="documentation.html">Documentation</a></li>
<li><a href="perf.html">Performance</a></li>
<li><a href="contrib.html">Contributors</a></li>
</ul>
</div>
</div>
<div id="content-wrap">
<div id="sidebar">
<div class="widgetspace">
<h1>Resources</h1>
<ul class="menu">
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
<li><a href="http://groups.google.com/group/ispc-users/">ispc
users mailing list</a></li>
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>
%(body_pre_docinfo)s
%(docinfo)s
<div id="content">
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>
</div>
</div>
%(body_suffix)s


@@ -26,10 +26,12 @@
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li><a href="documentation.html">Documentation</a></li>
<li id="selected"><a href="perf.html">Performance</a></li>
<li><a href="contrib.html">Contributors</a></li>
</ul>
</div>
</div>
@@ -55,7 +57,7 @@
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> &copy; 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>


@@ -26,10 +26,12 @@
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li id="selected"><a href="documentation.html">Documentation</a></li>
<li><a href="perf.html">Performance</a></li>
<li><a href="contrib.html">Contributors</a></li>
</ul>
</div>
</div>
@@ -55,7 +57,7 @@
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> &copy; 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>


@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
# This could be handy for archiving the generated documentation or
# if some version control system is used.
PROJECT_NUMBER = 1.1.4
PROJECT_NUMBER = 1.3.0
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.


@@ -39,9 +39,6 @@ example implementation of this function that counts the number of times the
callback is made and records some statistics about control flow coherence
is provided in the instrument.cpp file.
*** Note: on Linux, this example currently hits an assertion in LLVM during
*** compilation
Deferred
========
@@ -76,6 +73,14 @@ This directory includes three implementations of the algorithm:
light culling and shading.
GMRES
=====
An implementation of the generalized minimal residual method for solving
sparse matrix equations.
(http://en.wikipedia.org/wiki/Generalized_minimal_residual_method)
Mandelbrot
==========
@@ -110,6 +115,13 @@ This program implements both the Black-Scholes and Binomial options pricing
models in both ispc and regular serial C++ code.
Perfbench
=========
This runs a number of microbenchmarks to measure system performance and
code generation quality.
RT
==


@@ -50,7 +50,6 @@ struct Isect {
struct Sphere {
vec center;
float radius;
};
struct Plane {
@@ -83,7 +82,7 @@ static inline void vnormalize(vec &v) {
static void
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) {
float d = -dot(plane.p, plane.n);
float v = dot(ray.dir, plane.n);
@@ -103,7 +102,7 @@ ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
static inline void
ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) {
vec rs = ray.org - sphere.center;
float B = dot(rs, ray.dir);
@@ -148,7 +147,7 @@ orthoBasis(vec basis[3], vec n) {
static float
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3],
RNGState &rngstate) {
float eps = 0.0001f;
vec p, n;
@@ -204,14 +203,14 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
uniform int h, uniform int nsubsamples,
uniform float image[]) {
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static Sphere spheres[3] = {
static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static uniform Sphere spheres[3] = {
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate;
seed_rng(&rngstate, y0);
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
float invSamples = 1.f / nsubsamples;
foreach_tiled(y = y0 ... y1, x = 0 ... w,
@@ -269,5 +268,5 @@ static void task ao_task(uniform int width, uniform int height,
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
launch[h] < ao_task(w, h, nsubsamples, image) >;
launch[h] ao_task(w, h, nsubsamples, image);
}


@@ -211,7 +211,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate;
seed_rng(&rngstate, y0);
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
// Compute the mapping between the 'programCount'-wide program
// instances running in parallel and samples in the image.
@@ -329,5 +329,5 @@ static void task ao_task(uniform int width, uniform int height,
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
launch[h] < ao_task(w, h, nsubsamples, image) >;
launch[h] ao_task(w, h, nsubsamples, image);
}


@@ -1,16 +1,22 @@
TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=tasksys.o
TASK_OBJ=objs/tasksys.o
CXX=g++
CXXFLAGS=-Iobjs/ -O2 -m64
CC=gcc
CCFLAGS=-Iobjs/ -O2 -m64
LIBS=-lm $(TASK_LIB) -lstdc++
ISPC=ispc -O2 --arch=x86-64 $(ISPC_FLAGS)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o) $(TASK_OBJ))
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o))
CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o))
OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS)
default: $(EXAMPLE)
@@ -26,12 +32,15 @@ objs/%.cpp objs/%.o objs/%.h: dirs
clean:
/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16
$(EXAMPLE): $(CPP_OBJS) $(ISPC_OBJS)
$(EXAMPLE): $(OBJS)
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/%.o: %.cpp dirs $(ISPC_HEADER)
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: %.c dirs $(ISPC_HEADER)
$(CC) $< $(CCFLAGS) -c -o $@
objs/%.o: ../%.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@


@@ -204,6 +204,7 @@ void WriteFrame(const char *filename, const InputData *input,
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
input->header.framebufferHeight);
fwrite(framebufferAOS, imageBytes, 1, out);
fclose(out);
lAlignedFree(framebufferAOS);
}


@@ -35,35 +35,35 @@
struct InputDataArrays
{
uniform float * uniform zBuffer;
uniform unsigned int16 * uniform normalEncoded_x; // half float
uniform unsigned int16 * uniform normalEncoded_y; // half float
uniform unsigned int16 * uniform specularAmount; // half float
uniform unsigned int16 * uniform specularPower; // half float
uniform unsigned int8 * uniform albedo_x; // unorm8
uniform unsigned int8 * uniform albedo_y; // unorm8
uniform unsigned int8 * uniform albedo_z; // unorm8
uniform float * uniform lightPositionView_x;
uniform float * uniform lightPositionView_y;
uniform float * uniform lightPositionView_z;
uniform float * uniform lightAttenuationBegin;
uniform float * uniform lightColor_x;
uniform float * uniform lightColor_y;
uniform float * uniform lightColor_z;
uniform float * uniform lightAttenuationEnd;
float *zBuffer;
unsigned int16 *normalEncoded_x; // half float
unsigned int16 *normalEncoded_y; // half float
unsigned int16 *specularAmount; // half float
unsigned int16 *specularPower; // half float
unsigned int8 *albedo_x; // unorm8
unsigned int8 *albedo_y; // unorm8
unsigned int8 *albedo_z; // unorm8
float *lightPositionView_x;
float *lightPositionView_y;
float *lightPositionView_z;
float *lightAttenuationBegin;
float *lightColor_x;
float *lightColor_y;
float *lightColor_z;
float *lightAttenuationEnd;
};
struct InputHeader
{
uniform float cameraProj[4][4];
uniform float cameraNear;
uniform float cameraFar;
float cameraProj[4][4];
float cameraNear;
float cameraFar;
uniform int32 framebufferWidth;
uniform int32 framebufferHeight;
uniform int32 numLights;
uniform int32 inputDataChunkSize;
uniform int32 inputDataArrayOffsets[idaNum];
int32 framebufferWidth;
int32 framebufferHeight;
int32 numLights;
int32 inputDataChunkSize;
int32 inputDataArrayOffsets[idaNum];
};
@@ -327,8 +327,8 @@ ShadeTile(
// Reconstruct normal from G-buffer
float surface_normal_x, surface_normal_y, surface_normal_z;
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrt(4.0f * f - 1.0f);
@@ -339,9 +339,9 @@ ShadeTile(
// Load other G-buffer parameters
float surface_specularAmount =
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
half_to_float(inputData.specularAmount[gBufferOffset]);
float surface_specularPower =
half_to_float_fast(inputData.specularPower[gBufferOffset]);
half_to_float(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
@@ -514,9 +514,9 @@ RenderStatic(uniform InputHeader &inputHeader,
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
// by MIN_TILE_HEIGHT pixels.
launch[num_groups] < RenderTile(num_groups_x, num_groups_y,
inputHeader, inputData, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b) >;
launch[num_groups] RenderTile(num_groups_x, num_groups_y,
inputHeader, inputData, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
}
@@ -575,8 +575,6 @@ SplitTileMinMax(
uniform float light_positionView_z_array[],
uniform float light_attenuationEnd_array[],
// Outputs
// TODO: ISPC doesn't currently like multidimensionsal arrays so we'll do the
// indexing math ourselves
uniform int32 subtileIndices[],
uniform int32 subtileIndicesPitch,
uniform int32 subtileNumLights[]


@@ -87,7 +87,7 @@ int main(int argc, char** argv) {
framebuffer.clear();
reset_and_start_timer();
for (int j = 0; j < nframes; ++j)
ispc::RenderStatic(&input->header, &input->arrays,
ispc::RenderStatic(input->header, input->arrays,
VISUALIZE_LIGHT_COUNT,
framebuffer.r, framebuffer.g, framebuffer.b);
double mcycles = get_elapsed_mcycles() / nframes;


@@ -23,6 +23,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "perfbench", "perfbench\perfbench.vcxproj", "{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
@@ -119,6 +121,14 @@ Global
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.ActiveCfg = Debug|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.Build.0 = Debug|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.ActiveCfg = Debug|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.Build.0 = Debug|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.ActiveCfg = Release|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.Build.0 = Release|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.ActiveCfg = Release|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

examples/gmres/Makefile Normal file

@@ -0,0 +1,8 @@
EXAMPLE=gmres
CPP_SRC=algorithm.cpp main.cpp matrix.cpp
CC_SRC=mmio.c
ISPC_SRC=matrix.ispc
ISPC_TARGETS=sse2,sse4-x2,avx-x2
include ../common.mk


@@ -0,0 +1,231 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*===========================================================================*\
|* Includes
\*===========================================================================*/
#include "algorithm.h"
#include "stdio.h"
#include "debug.h"
/*===========================================================================*\
|* GMRES
\*===========================================================================*/
/* upper_triangular_right_solve:
* ----------------------------
* Given upper triangular matrix R and rhs vector b, solve for
* x. This "solve" ignores the rows, columns of R that are greater than the
* dimensions of x.
*/
void upper_triangular_right_solve (const DenseMatrix &R, const Vector &b, Vector &x)
{
// Dimensionality check
ASSERT(R.rows() >= b.size());
ASSERT(R.cols() >= x.size());
ASSERT(b.size() >= x.size());
int max_row = x.size() - 1;
// first solve step:
x[max_row] = b[max_row] / R(max_row, max_row);
for (int row = max_row - 1; row >= 0; row--) {
double xi = b[row];
for (int col = max_row; col > row; col--)
xi -= x[col] * R(row, col);
x[row] = xi / R(row, row);
}
}
/* create_rotation (used in gmres):
* -------------------------------
* Construct a Givens rotation to zero out the lowest non-zero entry in a partially
* factored Hessenberg matrix. Note that the previous Givens rotations should be
* applied to this column before creating a new rotation.
*/
void create_rotation (const DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
{
double a = H(col, col);
double b = H(col + 1, col);
double r;
if (b == 0) {
Cn[col] = copysign(1, a);
Sn[col] = 0;
}
else if (a == 0) {
Cn[col] = 0;
Sn[col] = copysign(1, b);
}
else {
r = sqrt(a*a + b*b);
Sn[col] = -b / r;
Cn[col] = a / r;
}
}
/* Applies the 'col'th Givens rotation stored in vectors Sn and Cn to the 'col'th
* column of the DenseMatrix M. (Previous columns don't need the rotation applied b/c
* presumably, the first col-1 columns are already upper triangular, and so their
* entries in the col and col+1 rows are 0.)
*/
void apply_rotation (DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
{
double c = Cn[col];
double s = Sn[col];
double tmp = c * H(col, col) - s * H(col+1, col);
H(col+1, col) = s * H(col, col) + c * H(col+1, col);
H(col, col) = tmp;
}
/* Applies the 'col'th Givens rotation to the vector.
*/
void apply_rotation (Vector &v, size_t col, Vector &Cn, Vector &Sn)
{
double a = v[col];
double b = v[col + 1];
double c = Cn[col];
double s = Sn[col];
v[col] = c * a - s * b;
v[col + 1] = s * a + c * b;
}
/* Applies the first 'col' Givens rotations to the newly-created column
* of H. (Leaves other columns alone.)
*/
void update_column (DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
{
for (int i = 0; i < col; i++) {
double c = Cn[i];
double s = Sn[i];
double t = c * H(i,col) - s * H(i+1,col);
H(i+1, col) = s * H(i,col) + c * H(i+1,col);
H(i, col) = t;
}
}
/* After a new column has been added to the Hessenberg matrix, factor it back into
* an upper-triangular matrix by:
* - applying the previous Givens rotations to the new column
* - computing the new Givens rotation to make the column upper triangular
* - applying the new Givens rotation to the column, and
* - applying the new Givens rotation to the solution vector
*/
void update_qr_decomp (DenseMatrix &H, Vector &s, size_t col, Vector &Cn, Vector &Sn)
{
update_column( H, col, Cn, Sn);
create_rotation(H, col, Cn, Sn);
apply_rotation( H, col, Cn, Sn);
apply_rotation( s, col, Cn, Sn);
}
void gmres (const Matrix &A, const Vector &b, Vector &x, int num_iters, double max_err)
{
DEBUG_PRINT("gmres starting!\n");
x.zero();
ASSERT(A.rows() == A.cols());
DenseMatrix Qstar(num_iters + 1, A.rows());
DenseMatrix H(num_iters + 1, num_iters);
// arrays for storing parameters of givens rotations
Vector Sn(num_iters);
Vector Cn(num_iters);
// array for storing the rhs projected onto the Hessenberg matrix's column space
Vector G(num_iters+1);
G.zero();
double beta = b.norm();
G[0] = beta;
// temp vector, stores Aqi
Vector w(A.rows());
w.copy(b);
w.normalize();
Qstar.set_row(0, w);
int iter = 0;
Vector temp(A.rows(), false);
double rel_err;
while (iter < num_iters)
{
// w = Aqi
Qstar.row(iter, temp);
A.multiply(temp, w);
// construct ith column of H, i+1th row of Qstar:
for (int row = 0; row <= iter; row++) {
Qstar.row(row, temp);
H(row, iter) = temp.dot(w);
w.add_ax(-H(row, iter), temp);
}
H(iter+1, iter) = w.norm();
w.divide(H(iter+1, iter));
Qstar.set_row(iter+1, w);
update_qr_decomp (H, G, iter, Cn, Sn);
rel_err = fabs(G[iter+1] / beta);
if (rel_err < max_err)
break;
if (iter % 100 == 0)
DEBUG_PRINT("Iter %d: %f err\n", iter, rel_err);
iter++;
}
if (iter == num_iters) {
fprintf(stderr, "Error: gmres failed to converge in %d iterations (relative err: %f)\n", num_iters, rel_err);
exit(-1);
}
// We've reached an acceptable solution (?):
DEBUG_PRINT("gmres completed in %d iterations (rel. resid. %f, max %f)\n", num_iters, rel_err, max_err);
Vector y(iter+1);
upper_triangular_right_solve(H, G, y);
for (int i = 0; i < iter + 1; i++) {
Qstar.row(i, temp);
x.add_ax(y[i], temp);
}
}


@@ -0,0 +1,50 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ALGORITHM_H__
#define __ALGORITHM_H__
#include "matrix.h"
/* Generalized Minimal Residual Method:
* -----------------------------------
* Takes a square matrix and an rhs and uses GMRES to find an estimate for x.
* The specified error is relative.
*/
void gmres (const Matrix &A, const Vector &b, Vector &x, int num_iters, double err);
#endif


examples/gmres/debug.h Normal file

@@ -0,0 +1,55 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __DEBUG_H__
#define __DEBUG_H__
#include <cassert>
/**************************************************************\
| Macros
\**************************************************************/
#define DEBUG
#ifdef DEBUG
#define ASSERT(expr) assert(expr)
#define DEBUG_PRINT(...) printf(__VA_ARGS__)
#else
#define ASSERT(expr)
#define DEBUG_PRINT(...)
#endif
#endif

examples/gmres/main.cpp Normal file

@@ -0,0 +1,79 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "matrix.h"
#include "algorithm.h"
#include "util.h"
#include <cmath>
#include "../timing.h"
int main (int argc, char **argv)
{
if (argc < 4) {
printf("usage: %s <input-matrix> <input-rhs> <output-file>\n", argv[0]);
return -1;
}
double gmres_cycles;
DEBUG_PRINT("Loading A...\n");
Matrix *A = CRSMatrix::matrix_from_mtf(argv[1]);
if (A == NULL)
return -1;
DEBUG_PRINT("... size: %lu\n", A->cols());
DEBUG_PRINT("Loading b...\n");
Vector *b = Vector::vector_from_mtf(argv[2]);
if (b == NULL)
return -1;
Vector x(A->cols());
DEBUG_PRINT("Beginning gmres...\n");
gmres(*A, *b, x, A->cols() / 2, .01);
// Write result out to file
x.to_mtf(argv[argc-1]);
// Compute residual (double-check)
#ifdef DEBUG
Vector bprime(b->size());
A->multiply(x, bprime);
Vector resid(bprime.size(), &(bprime[0]));
resid.subtract(*b);
DEBUG_PRINT("residual error check: %lg\n", resid.norm() / b->norm());
#endif
// Print profiling results
DEBUG_PRINT("-- Total mcycles to solve : %.03f --\n", gmres_cycles);
}

246
examples/gmres/matrix.cpp Normal file
View File

@@ -0,0 +1,246 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**************************************************************\
| Includes
\**************************************************************/
#include "matrix.h"
#include "matrix_ispc.h"
extern "C" {
#include "mmio.h"
}
/**************************************************************\
| DenseMatrix methods
\**************************************************************/
void DenseMatrix::multiply (const Vector &v, Vector &r) const
{
// Dimensionality check
ASSERT(v.size() == cols());
ASSERT(r.size() == rows());
for (int i = 0; i < rows(); i++)
r[i] = v.dot(entries + i * num_cols);
}
const Vector *DenseMatrix::row (size_t row) const {
return new Vector(num_cols, entries + row * num_cols, true);
}
void DenseMatrix::row (size_t row, Vector &r) {
r.entries = entries + row * cols();
r._size = cols();
}
void DenseMatrix::set_row(size_t row, const Vector &v)
{
ASSERT(v.size() == num_cols);
memcpy(entries + row * num_cols, v.entries, num_cols * sizeof(double));
}
/**************************************************************\
| CRSMatrix Methods
\**************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <algorithm>
struct entry {
int row;
int col;
double val;
};
bool compare_entries(struct entry i, struct entry j) {
if (i.row < j.row)
return true;
if (i.row > j.row)
return false;
return i.col < j.col;
}
#define ERR_OUT(...) { fprintf(stderr, __VA_ARGS__); return NULL; }
CRSMatrix *CRSMatrix::matrix_from_mtf (char *path) {
FILE *f;
MM_typecode matcode;
int m, n, nz;
if ((f = fopen(path, "r")) == NULL)
ERR_OUT("Error: %s does not name a valid/readable file.\n", path);
if (mm_read_banner(f, &matcode) != 0)
ERR_OUT("Error: Could not process Matrix Market banner.\n");
if (mm_is_complex(matcode))
ERR_OUT("Error: Application does not support complex numbers.\n")
if (mm_is_dense(matcode))
ERR_OUT("Error: supplied matrix is dense (should be sparse.)\n");
if (!mm_is_matrix(matcode))
ERR_OUT("Error: %s does not encode a matrix.\n", path)
if (mm_read_mtx_crd_size(f, &m, &n, &nz) != 0)
ERR_OUT("Error: could not read matrix size from file.\n");
if (m != n)
ERR_OUT("Error: Application does not support non-square matrices.");
std::vector<struct entry> entries;
entries.resize(nz);
for (int i = 0; i < nz; i++) {
fscanf(f, "%d %d %lg\n", &entries[i].row, &entries[i].col, &entries[i].val);
// Adjust from 1-based to 0-based
entries[i].row--;
entries[i].col--;
}
sort(entries.begin(), entries.end(), compare_entries);
CRSMatrix *M = new CRSMatrix(m, n, nz);
int cur_row = -1;
for (int i = 0; i < nz; i++) {
while (entries[i].row > cur_row)
M->row_offsets[++cur_row] = i;
M->entries[i] = entries[i].val;
M->columns[i] = entries[i].col;
}
fclose(f);
return M;
}
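/* A worked example of the layout built above (a sketch, using the CRSMatrix
members declared in matrix.h): for the 3x3 matrix
[ 5 0 0 ]
[ 0 8 3 ]
[ 0 0 6 ]
the sorted (row, col, val) triples produce
entries     = { 5, 8, 3, 6 }
columns     = { 0, 1, 2, 2 }
row_offsets = { 0, 1, 3 }   // row i spans [row_offsets[i], row_offsets[i+1])
with _nonzeroes = 4 terminating the final row, exactly as multiply() below expects. */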
Vector *Vector::vector_from_mtf (char *path) {
FILE *f;
MM_typecode matcode;
int m, n, nz;
if ((f = fopen(path, "r")) == NULL)
ERR_OUT("Error: %s does not name a valid/readable file.\n", path);
if (mm_read_banner(f, &matcode) != 0)
ERR_OUT("Error: Could not process Matrix Market banner.\n");
if (mm_is_complex(matcode))
ERR_OUT("Error: Application does not support complex numbers.\n")
if (mm_is_dense(matcode)) {
if (mm_read_mtx_array_size(f, &m, &n) != 0)
ERR_OUT("Error: could not read matrix size from file.\n");
} else {
if (mm_read_mtx_crd_size(f, &m, &n, &nz) != 0)
ERR_OUT("Error: could not read matrix size from file.\n");
}
if (n != 1)
ERR_OUT("Error: %s does not describe a vector.\n", path);
Vector *x = new Vector(m);
if (mm_is_dense(matcode)) {
double val;
for (int i = 0; i < m; i++) {
fscanf(f, "%lg\n", &val);
(*x)[i] = val;
}
}
else {
x->zero();
double val;
int row;
int col;
for (int i = 0; i < nz; i++) {
fscanf(f, "%d %d %lg\n", &row, &col, &val);
(*x)[row-1] = val;
}
}
fclose(f);
return x;
}
#define ERR(...) { fprintf(stderr, __VA_ARGS__); exit(-1); }
void Vector::to_mtf (char *path) {
FILE *f;
MM_typecode matcode;
mm_initialize_typecode(&matcode);
mm_set_matrix(&matcode);
mm_set_real(&matcode);
mm_set_dense(&matcode);
mm_set_general(&matcode);
if ((f = fopen(path, "w")) == NULL)
ERR("Error: cannot open/write to %s\n", path);
mm_write_banner(f, matcode);
mm_write_mtx_array_size(f, size(), 1);
for (int i = 0; i < size(); i++)
fprintf(f, "%lg\n", entries[i]);
fclose(f);
}
void CRSMatrix::multiply (const Vector &v, Vector &r) const
{
ASSERT(v.size() == cols());
ASSERT(r.size() == rows());
for (int row = 0; row < rows(); row++)
{
int row_offset = row_offsets[row];
int next_offset = ((row + 1 == rows()) ? _nonzeroes : row_offsets[row + 1]);
double sum = 0;
for (int i = row_offset; i < next_offset; i++)
{
sum += v[columns[i]] * entries[i];
}
r[row] = sum;
}
}
void CRSMatrix::zero ( )
{
entries.clear();
row_offsets.clear();
columns.clear();
_nonzeroes = 0;
}

279
examples/gmres/matrix.h Normal file
View File

@@ -0,0 +1,279 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MATRIX_H__
#define __MATRIX_H__
/**************************************************************\
| Includes
\**************************************************************/
#include <cstring> // memcpy
#include <cstdlib> // malloc, free
#include <cmath> // sqrt
#include <vector>
#include "debug.h"
#include "matrix_ispc.h"
class DenseMatrix;
/**************************************************************\
| Vector class
\**************************************************************/
class Vector {
public:
static Vector *vector_from_mtf(char *path);
void to_mtf (char *path);
Vector(size_t size, bool alloc_mem=true)
{
shared_ptr = false;
_size = size;
if (alloc_mem)
entries = (double *) malloc(sizeof(double) * _size);
else {
shared_ptr = true;
entries = NULL;
}
}
Vector(size_t size, double *content, bool share_ptr=false)
{
_size = size;
if (share_ptr) {
entries = content;
shared_ptr = true;
}
else {
shared_ptr = false;
entries = (double *) malloc(sizeof(double) * _size);
memcpy(entries, content, sizeof(double) * _size);
}
}
~Vector() { if (!shared_ptr) free(entries); }
const double & operator [] (size_t index) const
{
ASSERT(index < _size);
return *(entries + index);
}
double &operator [] (size_t index)
{
ASSERT(index < _size);
return *(entries + index);
}
bool operator == (const Vector &v) const
{
if (v.size() != _size)
return false;
for (int i = 0; i < _size; i++)
if (entries[i] != v[i])
return false;
return true;
}
size_t size() const {return _size; }
double dot (const Vector &b) const
{
ASSERT(b.size() == this->size());
return ispc::vector_dot(entries, b.entries, size());
}
double dot (const double * const b) const
{
return ispc::vector_dot(entries, b, size());
}
void zero ()
{
ispc::zero(entries, size());
}
double norm () const { return sqrt(dot(entries)); }
void normalize () { this->divide(this->norm()); }
void add (const Vector &a)
{
ASSERT(size() == a.size());
ispc::vector_add(entries, a.entries, size());
}
void subtract (const Vector &s)
{
ASSERT(size() == s.size());
ispc::vector_sub(entries, s.entries, size());
}
void multiply (double scalar)
{
ispc::vector_mult(entries, scalar, size());
}
void divide (double scalar)
{
ispc::vector_div(entries, scalar, size());
}
// Note: x may be longer than *(this)
void add_ax (double a, const Vector &x) {
ASSERT(x.size() >= size());
ispc::vector_add_ax(entries, a, x.entries, size());
}
// Note that copy only copies the first size() elements of the
// supplied vector, i.e. the supplied vector can be longer than
// this one. This is useful in least squares calculations.
void copy (const Vector &other) {
ASSERT(other.size() >= size());
memcpy(entries, other.entries, size() * sizeof(double));
}
friend class DenseMatrix;
private:
size_t _size;
bool shared_ptr;
double *entries;
};
/**************************************************************\
| Matrix base class
\**************************************************************/
class Matrix {
friend class Vector;
public:
Matrix(size_t size_r, size_t size_c)
{
num_rows = size_r;
num_cols = size_c;
}
~Matrix(){}
size_t rows() const { return num_rows; }
size_t cols() const { return num_cols; }
virtual void multiply (const Vector &v, Vector &r) const = 0;
virtual void zero () = 0;
protected:
size_t num_rows;
size_t num_cols;
};
/**************************************************************\
| DenseMatrix class
\**************************************************************/
class DenseMatrix : public Matrix {
friend class Vector;
public:
DenseMatrix(size_t size_r, size_t size_c) : Matrix(size_r, size_c)
{
entries = (double *) malloc(size_r * size_c * sizeof(double));
}
DenseMatrix(size_t size_r, size_t size_c, const double *content) : Matrix (size_r, size_c)
{
entries = (double *) malloc(size_r * size_c * sizeof(double));
memcpy(entries, content, size_r * size_c * sizeof(double));
}
virtual void multiply (const Vector &v, Vector &r) const;
double &operator () (unsigned int r, unsigned int c)
{
return *(entries + r * num_cols + c);
}
const double &operator () (unsigned int r, unsigned int c) const
{
return *(entries + r * num_cols + c);
}
const Vector *row(size_t row) const;
void row(size_t row, Vector &r);
void set_row(size_t row, const Vector &v);
virtual void zero() { ispc::zero(entries, rows() * cols()); }
void copy (const DenseMatrix &other)
{
ASSERT(rows() == other.rows());
ASSERT(cols() == other.cols());
memcpy(entries, other.entries, rows() * cols() * sizeof(double));
}
private:
double *entries;
bool shared_ptr;
};
/**************************************************************\
| CRSMatrix (compressed row storage, a sparse matrix format)
\**************************************************************/
class CRSMatrix : public Matrix {
public:
CRSMatrix (size_t size_r, size_t size_c, size_t nonzeroes) :
Matrix(size_r, size_c)
{
_nonzeroes = nonzeroes;
entries.resize(nonzeroes);
columns.resize(nonzeroes);
row_offsets.resize(size_r);
}
virtual void multiply(const Vector &v, Vector &r) const;
virtual void zero();
static CRSMatrix *matrix_from_mtf (char *path);
private:
unsigned int _nonzeroes;
std::vector<double> entries;
std::vector<int> row_offsets;
std::vector<int> columns;
};
#endif

122
examples/gmres/matrix.ispc Normal file
View File

@@ -0,0 +1,122 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**************************************************************\
| General
\**************************************************************/
export void zero (uniform double data[],
uniform int size)
{
foreach (i = 0 ... size)
data[i] = 0.0;
}
/**************************************************************\
| Vector helpers
\**************************************************************/
export void vector_add (uniform double a[],
const uniform double b[],
const uniform int size)
{
foreach (i = 0 ... size)
a[i] += b[i];
}
export void vector_sub (uniform double a[],
const uniform double b[],
const uniform int size)
{
foreach (i = 0 ... size)
a[i] -= b[i];
}
export void vector_mult (uniform double a[],
const uniform double b,
const uniform int size)
{
foreach (i = 0 ... size)
a[i] *= b;
}
export void vector_div (uniform double a[],
const uniform double b,
const uniform int size)
{
foreach (i = 0 ... size)
a[i] /= b;
}
export void vector_add_ax (uniform double r[],
const uniform double a,
const uniform double x[],
const uniform int size)
{
foreach (i = 0 ... size)
r[i] += a * x[i];
}
export uniform double vector_dot (const uniform double a[],
const uniform double b[],
const uniform int size)
{
varying double sum = 0.0;
foreach (i = 0 ... size)
sum += a[i] * b[i];
return reduce_add(sum);
}
/**************************************************************\
| Matrix helpers
\**************************************************************/
export void sparse_multiply (const uniform double entries[],
const uniform double columns[],
const uniform double row_offsets[],
const uniform int rows,
const uniform int cols,
const uniform int nonzeroes,
const uniform double v[],
uniform double r[])
{
foreach (row = 0 ... rows) {
int row_offset = row_offsets[row];
int next_offset = ((row + 1 == rows) ? nonzeroes : row_offsets[row+1]);
double sum = 0;
for (int j = row_offset; j < next_offset; j++)
sum += v[columns[j]] * entries[j];
r[row] = sum;
}
}

511
examples/gmres/mmio.c Normal file
View File

@@ -0,0 +1,511 @@
/*
* Matrix Market I/O library for ANSI C
*
* See http://math.nist.gov/MatrixMarket for details.
*
*
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include "mmio.h"
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
double **val_, int **I_, int **J_)
{
FILE *f;
MM_typecode matcode;
int M, N, nz;
int i;
double *val;
int *I, *J;
if ((f = fopen(fname, "r")) == NULL)
return -1;
if (mm_read_banner(f, &matcode) != 0)
{
printf("mm_read_unsymetric: Could not process Matrix Market banner ");
printf(" in file [%s]\n", fname);
return -1;
}
if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
mm_is_sparse(matcode)))
{
fprintf(stderr, "Sorry, this application does not support ");
fprintf(stderr, "Market Market type: [%s]\n",
mm_typecode_to_str(matcode));
return -1;
}
/* find out size of sparse matrix: M, N, nz .... */
if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
{
fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
return -1;
}
*M_ = M;
*N_ = N;
*nz_ = nz;
/* reserve memory for matrices */
I = (int *) malloc(nz * sizeof(int));
J = (int *) malloc(nz * sizeof(int));
val = (double *) malloc(nz * sizeof(double));
*val_ = val;
*I_ = I;
*J_ = J;
/* NOTE: when reading in doubles, ANSI C requires the use of the "l" */
/* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
/* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */
for (i=0; i<nz; i++)
{
fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
I[i]--; /* adjust from 1-based to 0-based */
J[i]--;
}
fclose(f);
return 0;
}
int mm_is_valid(MM_typecode matcode)
{
if (!mm_is_matrix(matcode)) return 0;
if (mm_is_dense(matcode) && mm_is_pattern(matcode)) return 0;
if (mm_is_real(matcode) && mm_is_hermitian(matcode)) return 0;
if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) ||
mm_is_skew(matcode))) return 0;
return 1;
}
int mm_read_banner(FILE *f, MM_typecode *matcode)
{
char line[MM_MAX_LINE_LENGTH];
char banner[MM_MAX_TOKEN_LENGTH];
char mtx[MM_MAX_TOKEN_LENGTH];
char crd[MM_MAX_TOKEN_LENGTH];
char data_type[MM_MAX_TOKEN_LENGTH];
char storage_scheme[MM_MAX_TOKEN_LENGTH];
char *p;
mm_clear_typecode(matcode);
if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
return MM_PREMATURE_EOF;
if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type,
storage_scheme) != 5)
return MM_PREMATURE_EOF;
for (p=mtx; *p!='\0'; *p=tolower(*p),p++); /* convert to lower case */
for (p=crd; *p!='\0'; *p=tolower(*p),p++);
for (p=data_type; *p!='\0'; *p=tolower(*p),p++);
for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++);
/* check for banner */
if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
return MM_NO_HEADER;
/* first field should be "mtx" */
if (strcmp(mtx, MM_MTX_STR) != 0)
return MM_UNSUPPORTED_TYPE;
mm_set_matrix(matcode);
/* second field describes whether this is a sparse matrix (in coordinate
storage) or a dense array */
if (strcmp(crd, MM_SPARSE_STR) == 0)
mm_set_sparse(matcode);
else
if (strcmp(crd, MM_DENSE_STR) == 0)
mm_set_dense(matcode);
else
return MM_UNSUPPORTED_TYPE;
/* third field */
if (strcmp(data_type, MM_REAL_STR) == 0)
mm_set_real(matcode);
else
if (strcmp(data_type, MM_COMPLEX_STR) == 0)
mm_set_complex(matcode);
else
if (strcmp(data_type, MM_PATTERN_STR) == 0)
mm_set_pattern(matcode);
else
if (strcmp(data_type, MM_INT_STR) == 0)
mm_set_integer(matcode);
else
return MM_UNSUPPORTED_TYPE;
/* fourth field */
if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
mm_set_general(matcode);
else
if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
mm_set_symmetric(matcode);
else
if (strcmp(storage_scheme, MM_HERM_STR) == 0)
mm_set_hermitian(matcode);
else
if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
mm_set_skew(matcode);
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
{
if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
{
char line[MM_MAX_LINE_LENGTH];
int num_items_read;
/* set return null parameter values, in case we exit with errors */
*M = *N = *nz = 0;
/* now continue scanning until you reach the end-of-comments */
do
{
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
return MM_PREMATURE_EOF;
}while (line[0] == '%');
/* line[] is either blank or has M,N, nz */
if (sscanf(line, "%d %d %d", M, N, nz) == 3)
return 0;
else
do
{
num_items_read = fscanf(f, "%d %d %d", M, N, nz);
if (num_items_read == EOF) return MM_PREMATURE_EOF;
}
while (num_items_read != 3);
return 0;
}
int mm_read_mtx_array_size(FILE *f, int *M, int *N)
{
char line[MM_MAX_LINE_LENGTH];
int num_items_read;
/* set return null parameter values, in case we exit with errors */
*M = *N = 0;
/* now continue scanning until you reach the end-of-comments */
do
{
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
return MM_PREMATURE_EOF;
}while (line[0] == '%');
/* line[] is either blank or has M, N */
if (sscanf(line, "%d %d", M, N) == 2)
return 0;
else /* we have a blank line */
do
{
num_items_read = fscanf(f, "%d %d", M, N);
if (num_items_read == EOF) return MM_PREMATURE_EOF;
}
while (num_items_read != 2);
return 0;
}
int mm_write_mtx_array_size(FILE *f, int M, int N)
{
if (fprintf(f, "%d %d\n", M, N) != 2)
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
/*-------------------------------------------------------------------------*/
/******************************************************************/
/* use when I[], J[], and val[] are already allocated */
/******************************************************************/
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode)
{
int i;
if (mm_is_complex(matcode))
{
for (i=0; i<nz; i++)
if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2*i], &val[2*i+1])
!= 4) return MM_PREMATURE_EOF;
}
else if (mm_is_real(matcode))
{
for (i=0; i<nz; i++)
{
if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i])
!= 3) return MM_PREMATURE_EOF;
}
}
else if (mm_is_pattern(matcode))
{
for (i=0; i<nz; i++)
if (fscanf(f, "%d %d", &I[i], &J[i])
!= 2) return MM_PREMATURE_EOF;
}
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
double *real, double *imag, MM_typecode matcode)
{
if (mm_is_complex(matcode))
{
if (fscanf(f, "%d %d %lg %lg", I, J, real, imag)
!= 4) return MM_PREMATURE_EOF;
}
else if (mm_is_real(matcode))
{
if (fscanf(f, "%d %d %lg\n", I, J, real)
!= 3) return MM_PREMATURE_EOF;
}
else if (mm_is_pattern(matcode))
{
if (fscanf(f, "%d %d", I, J) != 2) return MM_PREMATURE_EOF;
}
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
/************************************************************************
mm_read_mtx_crd() fills M, N, nz, array of values, and return
type code, e.g. 'MCRS'
if matrix is complex, values[] is of size 2*nz,
(nz pairs of real/imaginary values)
************************************************************************/
int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J,
double **val, MM_typecode *matcode)
{
int ret_code;
FILE *f;
if (strcmp(fname, "stdin") == 0) f=stdin;
else
if ((f = fopen(fname, "r")) == NULL)
return MM_COULD_NOT_READ_FILE;
if ((ret_code = mm_read_banner(f, matcode)) != 0)
return ret_code;
if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) &&
mm_is_matrix(*matcode)))
return MM_UNSUPPORTED_TYPE;
if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
return ret_code;
*I = (int *) malloc(*nz * sizeof(int));
*J = (int *) malloc(*nz * sizeof(int));
*val = NULL;
if (mm_is_complex(*matcode))
{
*val = (double *) malloc(*nz * 2 * sizeof(double));
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
else if (mm_is_real(*matcode))
{
*val = (double *) malloc(*nz * sizeof(double));
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
else if (mm_is_pattern(*matcode))
{
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
if (f != stdin) fclose(f);
return 0;
}
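/* Typical use (a sketch; error handling and the pattern/complex cases omitted):
int M, N, nz, *I, *J;
double *val;
MM_typecode tc;
if (mm_read_mtx_crd(path, &M, &N, &nz, &I, &J, &val, &tc) == 0) {
    ... the nz entries are (I[k], J[k], val[k]) triples with 1-based indices ...
    free(I); free(J); free(val);
}
*/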
int mm_write_banner(FILE *f, MM_typecode matcode)
{
char *str = mm_typecode_to_str(matcode);
int ret_code;
ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
free(str);
if (ret_code < 0) /* a negative return from fprintf signals an error */
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode)
{
FILE *f;
int i;
if (strcmp(fname, "stdout") == 0)
f = stdout;
else
if ((f = fopen(fname, "w")) == NULL)
return MM_COULD_NOT_WRITE_FILE;
/* print banner followed by typecode */
fprintf(f, "%s ", MatrixMarketBanner);
fprintf(f, "%s\n", mm_typecode_to_str(matcode));
/* print matrix sizes and nonzeros */
fprintf(f, "%d %d %d\n", M, N, nz);
/* print values */
if (mm_is_pattern(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d\n", I[i], J[i]);
else
if (mm_is_real(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
else
if (mm_is_complex(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i],
val[2*i+1]);
else
{
if (f != stdout) fclose(f);
return MM_UNSUPPORTED_TYPE;
}
if (f !=stdout) fclose(f);
return 0;
}
/**
* Create a new copy of a string s. mm_strdup() is a common routine, but
* not part of ANSI C, so it is included here. Used by mm_typecode_to_str().
*
*/
char *mm_strdup(const char *s)
{
int len = strlen(s);
char *s2 = (char *) malloc((len+1)*sizeof(char));
return strcpy(s2, s);
}
char *mm_typecode_to_str(MM_typecode matcode)
{
char buffer[MM_MAX_LINE_LENGTH];
char *types[4];
char *mm_strdup(const char *);
int error =0;
/* check for MTX type */
if (mm_is_matrix(matcode))
types[0] = MM_MTX_STR;
else
error=1;
/* check for CRD or ARR matrix */
if (mm_is_sparse(matcode))
types[1] = MM_SPARSE_STR;
else
if (mm_is_dense(matcode))
types[1] = MM_DENSE_STR;
else
return NULL;
/* check for element data type */
if (mm_is_real(matcode))
types[2] = MM_REAL_STR;
else
if (mm_is_complex(matcode))
types[2] = MM_COMPLEX_STR;
else
if (mm_is_pattern(matcode))
types[2] = MM_PATTERN_STR;
else
if (mm_is_integer(matcode))
types[2] = MM_INT_STR;
else
return NULL;
/* check for symmetry type */
if (mm_is_general(matcode))
types[3] = MM_GENERAL_STR;
else
if (mm_is_symmetric(matcode))
types[3] = MM_SYMM_STR;
else
if (mm_is_hermitian(matcode))
types[3] = MM_HERM_STR;
else
if (mm_is_skew(matcode))
types[3] = MM_SKEW_STR;
else
return NULL;
sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
return mm_strdup(buffer);
}

135
examples/gmres/mmio.h Normal file
View File

@@ -0,0 +1,135 @@
/*
* Matrix Market I/O library for ANSI C
*
* See http://math.nist.gov/MatrixMarket for details.
*
*
*/
#ifndef MM_IO_H
#define MM_IO_H
#define MM_MAX_LINE_LENGTH 1025
#define MatrixMarketBanner "%%MatrixMarket"
#define MM_MAX_TOKEN_LENGTH 64
typedef char MM_typecode[4];
#include <stdio.h>
char *mm_typecode_to_str(MM_typecode matcode);
int mm_read_banner(FILE *f, MM_typecode *matcode);
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
int mm_read_mtx_array_size(FILE *f, int *M, int *N);
int mm_write_banner(FILE *f, MM_typecode matcode);
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
int mm_write_mtx_array_size(FILE *f, int M, int N);
/********************* MM_typecode query functions ***************************/
#define mm_is_matrix(typecode) ((typecode)[0]=='M')
#define mm_is_sparse(typecode) ((typecode)[1]=='C')
#define mm_is_coordinate(typecode)((typecode)[1]=='C')
#define mm_is_dense(typecode) ((typecode)[1]=='A')
#define mm_is_array(typecode) ((typecode)[1]=='A')
#define mm_is_complex(typecode) ((typecode)[2]=='C')
#define mm_is_real(typecode) ((typecode)[2]=='R')
#define mm_is_pattern(typecode) ((typecode)[2]=='P')
#define mm_is_integer(typecode) ((typecode)[2]=='I')
#define mm_is_symmetric(typecode)((typecode)[3]=='S')
#define mm_is_general(typecode) ((typecode)[3]=='G')
#define mm_is_skew(typecode) ((typecode)[3]=='K')
#define mm_is_hermitian(typecode)((typecode)[3]=='H')
int mm_is_valid(MM_typecode matcode); /* too complex for a macro */
/********************* MM_typecode modify functions ***************************/
#define mm_set_matrix(typecode) ((*typecode)[0]='M')
#define mm_set_coordinate(typecode) ((*typecode)[1]='C')
#define mm_set_array(typecode) ((*typecode)[1]='A')
#define mm_set_dense(typecode) mm_set_array(typecode)
#define mm_set_sparse(typecode) mm_set_coordinate(typecode)
#define mm_set_complex(typecode)((*typecode)[2]='C')
#define mm_set_real(typecode) ((*typecode)[2]='R')
#define mm_set_pattern(typecode)((*typecode)[2]='P')
#define mm_set_integer(typecode)((*typecode)[2]='I')
#define mm_set_symmetric(typecode)((*typecode)[3]='S')
#define mm_set_general(typecode)((*typecode)[3]='G')
#define mm_set_skew(typecode) ((*typecode)[3]='K')
#define mm_set_hermitian(typecode)((*typecode)[3]='H')
#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
(*typecode)[2]=' ',(*typecode)[3]='G')
#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
/********************* Matrix Market error codes ***************************/
#define MM_COULD_NOT_READ_FILE 11
#define MM_PREMATURE_EOF 12
#define MM_NOT_MTX 13
#define MM_NO_HEADER 14
#define MM_UNSUPPORTED_TYPE 15
#define MM_LINE_TOO_LONG 16
#define MM_COULD_NOT_WRITE_FILE 17
/******************** Matrix Market internal definitions ********************
MM_matrix_typecode: 4-character sequence
                      object     sparse/     data        storage
                                 dense       type        scheme
string position:      [0]        [1]         [2]         [3]
Matrix typecode:      M(atrix)   C(oord)     R(eal)      G(eneral)
                                 A(rray)     C(omplex)   H(ermitian)
                                             P(attern)   S(ymmetric)
                                             I(nteger)   K(skew)
***********************************************************************/
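/* Example: the banner line "%%MatrixMarket matrix coordinate real general"
parses to the typecode 'M','C','R','G', i.e. a sparse (coordinate) matrix of
real entries with no symmetry recorded. */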
#define MM_MTX_STR "matrix"
#define MM_ARRAY_STR "array"
#define MM_DENSE_STR "array"
#define MM_COORDINATE_STR "coordinate"
#define MM_SPARSE_STR "coordinate"
#define MM_COMPLEX_STR "complex"
#define MM_REAL_STR "real"
#define MM_INT_STR "integer"
#define MM_GENERAL_STR "general"
#define MM_SYMM_STR "symmetric"
#define MM_HERM_STR "hermitian"
#define MM_SKEW_STR "skew-symmetric"
#define MM_PATTERN_STR "pattern"
/* high level routines */
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode);
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode);
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
MM_typecode matcode);
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
double **val_, int **I_, int **J_);
#endif

53
examples/gmres/util.h Normal file
View File

@@ -0,0 +1,53 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __UTIL_H__
#define __UTIL_H__
#include <stdio.h>
#include "matrix.h"
inline void printMatrix (DenseMatrix &M, const char *name) {
printf("Matrix %s:\n", name);
for (int row = 0; row < M.rows(); row++) {
printf("row %2d: ", row + 1);
for (int col = 0; col < M.cols(); col++)
printf("%6f ", M(row, col));
printf("\n");
}
printf("\n");
}
#endif

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
struct __vec16_i1 {
__vec16_i1() { }
__vec16_i1(const uint16_t &vv) : v(vv) { }
__vec16_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3,
uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7,
uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11,
@@ -193,13 +194,22 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \
return ret; \
}
#define CMP_OP(TYPE, CAST, NAME, OP) \
static FORCEINLINE __vec16_i1 NAME(TYPE a, TYPE b) { \
#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \
static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \
__vec16_i1 ret; \
ret.v = 0; \
for (int i = 0; i < 16; ++i) \
ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \
return ret; \
} \
static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \
__vec16_i1 mask) { \
__vec16_i1 ret; \
ret.v = 0; \
for (int i = 0; i < 16; ++i) \
ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \
ret.v &= mask.v; \
return ret; \
}
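// For instance, CMP_OP(__vec16_i32, i32, int32_t, __equal, ==) now expands to the
// pair __equal_i32() and __equal_i32_and_mask(); the _and_mask variant folds the
// execution mask into the comparison result so callers no longer need a separate
// __and() on the returned __vec16_i1 (a sketch of the expansion, matching the
// instantiations further down).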
#define INSERT_EXTRACT(VTYPE, STYPE) \
@@ -211,14 +221,16 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
}
#define LOAD_STORE(VTYPE, STYPE) \
static FORCEINLINE VTYPE __load(VTYPE *p, int align) { \
template <int ALIGN> \
static FORCEINLINE VTYPE __load(const VTYPE *p) { \
STYPE *ptr = (STYPE *)p; \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = ptr[i]; \
return ret; \
} \
static FORCEINLINE void __store(VTYPE *p, VTYPE v, int align) { \
template <int ALIGN> \
static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \
STYPE *ptr = (STYPE *)p; \
for (int i = 0; i < 16; ++i) \
ptr[i] = v.v[i]; \
@@ -259,13 +271,29 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
return ret; \
}
#define SMEAR(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = v; \
return ret; \
} \
#define SMEAR(VTYPE, NAME, STYPE) \
template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) { \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = v; \
return ret; \
}
#define SETZERO(VTYPE, NAME) \
template <class RetVecType> VTYPE __setzero_##NAME(); \
template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() { \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = 0; \
return ret; \
}
#define UNDEF(VTYPE, NAME) \
template <class RetVecType> VTYPE __undef_##NAME(); \
template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() { \
return VTYPE(); \
}
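// These are now function templates keyed on the requested return vector type, so a
// call site names the result type explicitly, e.g. (a sketch):
//     __vec16_i32 v = __smear_i32<__vec16_i32>(42);   // all 16 lanes set to 42
//     __vec16_f   z = __setzero_float<__vec16_f>();   // all 16 lanes set to 0.0f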
#define BROADCAST(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
@@ -311,11 +339,23 @@ INSERT_EXTRACT(__vec1_d, double)
///////////////////////////////////////////////////////////////////////////
// mask ops
static FORCEINLINE uint32_t __movmsk(__vec16_i1 mask) {
return mask.v;
static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) {
return (uint64_t)mask.v;
}
static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {
static FORCEINLINE bool __any(__vec16_i1 mask) {
return (mask.v!=0);
}
static FORCEINLINE bool __all(__vec16_i1 mask) {
return (mask.v==0xFFFF);
}
static FORCEINLINE bool __none(__vec16_i1 mask) {
return (mask.v==0);
}
static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
__vec16_i1 r;
r.v = (a.v & b.v) | (~a.v & ~b.v);
return r;
@@ -339,6 +379,24 @@ static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) {
return r;
}
static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) {
__vec16_i1 r;
r.v = ~v.v;
return r;
}
static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) {
__vec16_i1 r;
r.v = ~a.v & b.v;
return r;
}
static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) {
__vec16_i1 r;
r.v = a.v & ~b.v;
return r;
}
static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a,
__vec16_i1 b) {
__vec16_i1 r;
@@ -362,18 +420,36 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
vec->v |= (1 << index);
}
static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p, int align) {
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
uint16_t *ptr = (uint16_t *)p;
__vec16_i1 r;
r.v = *ptr;
return r;
}
static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v, int align) {
template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) {
uint16_t *ptr = (uint16_t *)p;
*ptr = v.v;
}
template <class RetVecType> __vec16_i1 __smear_i1(int i);
template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int v) {
return __vec16_i1(v, v, v, v, v, v, v, v,
v, v, v, v, v, v, v, v);
}
template <class RetVecType> __vec16_i1 __setzero_i1();
template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
return __vec16_i1(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0);
}
template <class RetVecType> __vec16_i1 __undef_i1();
template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
return __vec16_i1();
}
///////////////////////////////////////////////////////////////////////////
// int8
@@ -398,20 +474,22 @@ SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<)
CMP_OP(__vec16_i8, int8_t, __equal, ==)
CMP_OP(__vec16_i8, int8_t, __not_equal, !=)
CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i8, int8_t, __signed_less_equal, <=)
CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i8, int8_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i8, uint8_t, __unsigned_less_than, <)
CMP_OP(__vec16_i8, int8_t, __signed_less_than, <)
CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i8, int8_t, __signed_greater_than, >)
CMP_OP(__vec16_i8, i8, int8_t, __equal, ==)
CMP_OP(__vec16_i8, i8, int8_t, __not_equal, !=)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <)
CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >)
SELECT(__vec16_i8)
INSERT_EXTRACT(__vec16_i8, int8_t)
SMEAR(__vec16_i8, i8, int8_t)
SETZERO(__vec16_i8, i8)
UNDEF(__vec16_i8, i8)
BROADCAST(__vec16_i8, i8, int8_t)
ROTATE(__vec16_i8, i8, int8_t)
SHUFFLES(__vec16_i8, i8, int8_t)
@@ -441,20 +519,22 @@ SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<)
CMP_OP(__vec16_i16, int16_t, __equal, ==)
CMP_OP(__vec16_i16, int16_t, __not_equal, !=)
CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i16, int16_t, __signed_less_equal, <=)
CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i16, int16_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i16, uint16_t, __unsigned_less_than, <)
CMP_OP(__vec16_i16, int16_t, __signed_less_than, <)
CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i16, int16_t, __signed_greater_than, >)
CMP_OP(__vec16_i16, i16, int16_t, __equal, ==)
CMP_OP(__vec16_i16, i16, int16_t, __not_equal, !=)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <)
CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >)
SELECT(__vec16_i16)
INSERT_EXTRACT(__vec16_i16, int16_t)
SMEAR(__vec16_i16, i16, int16_t)
SETZERO(__vec16_i16, i16)
UNDEF(__vec16_i16, i16)
BROADCAST(__vec16_i16, i16, int16_t)
ROTATE(__vec16_i16, i16, int16_t)
SHUFFLES(__vec16_i16, i16, int16_t)
@@ -484,20 +564,22 @@ SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<)
CMP_OP(__vec16_i32, int32_t, __equal, ==)
CMP_OP(__vec16_i32, int32_t, __not_equal, !=)
CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i32, int32_t, __signed_less_equal, <=)
CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i32, int32_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i32, uint32_t, __unsigned_less_than, <)
CMP_OP(__vec16_i32, int32_t, __signed_less_than, <)
CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i32, int32_t, __signed_greater_than, >)
CMP_OP(__vec16_i32, i32, int32_t, __equal, ==)
CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <)
CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >)
SELECT(__vec16_i32)
INSERT_EXTRACT(__vec16_i32, int32_t)
SMEAR(__vec16_i32, i32, int32_t)
SETZERO(__vec16_i32, i32)
UNDEF(__vec16_i32, i32)
BROADCAST(__vec16_i32, i32, int32_t)
ROTATE(__vec16_i32, i32, int32_t)
SHUFFLES(__vec16_i32, i32, int32_t)
@@ -527,20 +609,22 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
CMP_OP(__vec16_i64, int64_t, __equal, ==)
CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i64, int64_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_than, <)
CMP_OP(__vec16_i64, int64_t, __signed_less_than, <)
CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >)
CMP_OP(__vec16_i64, i64, int64_t, __equal, ==)
CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=)
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=)
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <)
CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <)
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >)
SELECT(__vec16_i64)
INSERT_EXTRACT(__vec16_i64, int64_t)
SMEAR(__vec16_i64, i64, int64_t)
SETZERO(__vec16_i64, i64)
UNDEF(__vec16_i64, i64)
BROADCAST(__vec16_i64, i64, int64_t)
ROTATE(__vec16_i64, i64, int64_t)
SHUFFLES(__vec16_i64, i64, int64_t)
@@ -554,14 +638,14 @@ BINARY_OP(__vec16_f, __sub, -)
BINARY_OP(__vec16_f, __mul, *)
BINARY_OP(__vec16_f, __div, /)
CMP_OP(__vec16_f, float, __equal, ==)
CMP_OP(__vec16_f, float, __not_equal, !=)
CMP_OP(__vec16_f, float, __less_than, <)
CMP_OP(__vec16_f, float, __less_equal, <=)
CMP_OP(__vec16_f, float, __greater_than, >)
CMP_OP(__vec16_f, float, __greater_equal, >=)
CMP_OP(__vec16_f, float, float, __equal, ==)
CMP_OP(__vec16_f, float, float, __not_equal, !=)
CMP_OP(__vec16_f, float, float, __less_than, <)
CMP_OP(__vec16_f, float, float, __less_equal, <=)
CMP_OP(__vec16_f, float, float, __greater_than, >)
CMP_OP(__vec16_f, float, float, __greater_equal, >=)
static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) {
__vec16_i1 ret;
ret.v = 0;
for (int i = 0; i < 16; ++i)
@@ -569,6 +653,14 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
return ret;
}
static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) {
__vec16_i1 ret;
ret.v = 0;
for (int i = 0; i < 16; ++i)
ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
return ret;
}
#if 0
case Instruction::FRem: intrinsic = "__frem"; break;
#endif
@@ -576,11 +668,128 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
SELECT(__vec16_f)
INSERT_EXTRACT(__vec16_f, float)
SMEAR(__vec16_f, float, float)
SETZERO(__vec16_f, float)
UNDEF(__vec16_f, float)
BROADCAST(__vec16_f, float, float)
ROTATE(__vec16_f, float, float)
SHUFFLES(__vec16_f, float, float)
LOAD_STORE(__vec16_f, float)
static FORCEINLINE float __exp_uniform_float(float v) {
return expf(v);
}
static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) {
__vec16_f ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = expf(v.v[i]);
return ret;
}
static FORCEINLINE float __log_uniform_float(float v) {
return logf(v);
}
static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) {
__vec16_f ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = logf(v.v[i]);
return ret;
}
static FORCEINLINE float __pow_uniform_float(float a, float b) {
return powf(a, b);
}
static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) {
__vec16_f ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = powf(a.v[i], b.v[i]);
return ret;
}
static FORCEINLINE int __intbits(float v) {
union {
float f;
int i;
} u;
u.f = v;
return u.i;
}
static FORCEINLINE float __floatbits(int v) {
union {
float f;
int i;
} u;
u.i = v;
return u.f;
}
static FORCEINLINE float __half_to_float_uniform(int16_t h) {
static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits
uint32_t exp = shifted_exp & o; // just the exponent
o += (127 - 15) << 23; // exponent adjust
// handle exponent special cases
if (exp == shifted_exp) // Inf/NaN?
o += (128 - 16) << 23; // extra exp adjust
else if (exp == 0) { // Zero/Denormal?
o += 1 << 23; // extra exp adjust
o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize
}
o |= ((int32_t)(h & 0x8000)) << 16; // sign bit
return __floatbits(o);
}
static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) {
__vec16_f ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = __half_to_float_uniform(v.v[i]);
return ret;
}
static FORCEINLINE int16_t __float_to_half_uniform(float f) {
uint32_t sign_mask = 0x80000000u;
int32_t o;
int32_t fint = __intbits(f);
int32_t sign = fint & sign_mask;
fint ^= sign;
int32_t f32infty = 255 << 23;
o = (fint > f32infty) ? 0x7e00 : 0x7c00;
// (De)normalized number or zero
// update fint unconditionally to save the blending; we don't need it
// anymore for the Inf/NaN case anyway.
const uint32_t round_mask = ~0xfffu;
const int32_t magic = 15 << 23;
const int32_t f16infty = 31 << 23;
int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask;
fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
if (fint < f32infty)
o = fint2 >> 13; // Take the bits!
return (o | (sign >> 16));
}
static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) {
__vec16_i16 ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = __float_to_half_uniform(v.v[i]);
return ret;
}
///////////////////////////////////////////////////////////////////////////
// double
@@ -589,14 +798,14 @@ BINARY_OP(__vec16_d, __sub, -)
BINARY_OP(__vec16_d, __mul, *)
BINARY_OP(__vec16_d, __div, /)
CMP_OP(__vec16_d, double, __equal, ==)
CMP_OP(__vec16_d, double, __not_equal, !=)
CMP_OP(__vec16_d, double, __less_than, <)
CMP_OP(__vec16_d, double, __less_equal, <=)
CMP_OP(__vec16_d, double, __greater_than, >)
CMP_OP(__vec16_d, double, __greater_equal, >=)
CMP_OP(__vec16_d, double, double, __equal, ==)
CMP_OP(__vec16_d, double, double, __not_equal, !=)
CMP_OP(__vec16_d, double, double, __less_than, <)
CMP_OP(__vec16_d, double, double, __less_equal, <=)
CMP_OP(__vec16_d, double, double, __greater_than, >)
CMP_OP(__vec16_d, double, double, __greater_equal, >=)
static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.v = 0;
for (int i = 0; i < 16; ++i)
@@ -604,6 +813,14 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
return ret;
}
static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.v = 0;
for (int i = 0; i < 16; ++i)
ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
return ret;
}
#if 0
case Instruction::FRem: intrinsic = "__frem"; break;
#endif
@@ -611,6 +828,8 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
SELECT(__vec16_d)
INSERT_EXTRACT(__vec16_d, double)
SMEAR(__vec16_d, double, double)
SETZERO(__vec16_d, double)
UNDEF(__vec16_d, double)
BROADCAST(__vec16_d, double, double)
ROTATE(__vec16_d, double, double)
SHUFFLES(__vec16_d, double, double)
@@ -962,8 +1181,8 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
///////////////////////////////////////////////////////////////////////////
// masked load/store
static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
__vec16_i1 mask) {
static FORCEINLINE __vec16_i8 __masked_load_i8(void *p,
__vec16_i1 mask) {
__vec16_i8 ret;
int8_t *ptr = (int8_t *)p;
for (int i = 0; i < 16; ++i)
@@ -972,8 +1191,8 @@ static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
return ret;
}
static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
__vec16_i1 mask) {
static FORCEINLINE __vec16_i16 __masked_load_i16(void *p,
__vec16_i1 mask) {
__vec16_i16 ret;
int16_t *ptr = (int16_t *)p;
for (int i = 0; i < 16; ++i)
@@ -982,8 +1201,8 @@ static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
return ret;
}
static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
__vec16_i1 mask) {
static FORCEINLINE __vec16_i32 __masked_load_i32(void *p,
__vec16_i1 mask) {
__vec16_i32 ret;
int32_t *ptr = (int32_t *)p;
for (int i = 0; i < 16; ++i)
@@ -992,8 +1211,18 @@ static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
return ret;
}
static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
__vec16_i1 mask) {
static FORCEINLINE __vec16_f __masked_load_float(void *p,
__vec16_i1 mask) {
__vec16_f ret;
float *ptr = (float *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ret.v[i] = ptr[i];
return ret;
}
static FORCEINLINE __vec16_i64 __masked_load_i64(void *p,
__vec16_i1 mask) {
__vec16_i64 ret;
int64_t *ptr = (int64_t *)p;
for (int i = 0; i < 16; ++i)
@@ -1002,31 +1231,49 @@ static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
return ret;
}
static FORCEINLINE void __masked_store_8(void *p, __vec16_i8 val,
__vec16_i1 mask) {
static FORCEINLINE __vec16_d __masked_load_double(void *p,
__vec16_i1 mask) {
__vec16_d ret;
double *ptr = (double *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ret.v[i] = ptr[i];
return ret;
}
static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val,
__vec16_i1 mask) {
int8_t *ptr = (int8_t *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_16(void *p, __vec16_i16 val,
__vec16_i1 mask) {
static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val,
__vec16_i1 mask) {
int16_t *ptr = (int16_t *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_32(void *p, __vec16_i32 val,
__vec16_i1 mask) {
static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val,
__vec16_i1 mask) {
int32_t *ptr = (int32_t *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
static FORCEINLINE void __masked_store_float(void *p, __vec16_f val,
__vec16_i1 mask) {
float *ptr = (float *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val,
__vec16_i1 mask) {
int64_t *ptr = (int64_t *)p;
for (int i = 0; i < 16; ++i)
@@ -1034,24 +1281,42 @@ static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_blend_8(void *p, __vec16_i8 val,
__vec16_i1 mask) {
__masked_store_8(p, val, mask);
static FORCEINLINE void __masked_store_double(void *p, __vec16_d val,
__vec16_i1 mask) {
double *ptr = (double *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_blend_16(void *p, __vec16_i16 val,
static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val,
__vec16_i1 mask) {
__masked_store_16(p, val, mask);
__masked_store_i8(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_32(void *p, __vec16_i32 val,
__vec16_i1 mask) {
__masked_store_32(p, val, mask);
static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val,
__vec16_i1 mask) {
__masked_store_i16(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
__vec16_i1 mask) {
__masked_store_64(p, val, mask);
static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val,
__vec16_i1 mask) {
__masked_store_i32(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
__vec16_i1 mask) {
__masked_store_float(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val,
__vec16_i1 mask) {
__masked_store_i64(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val,
__vec16_i1 mask) {
__masked_store_double(p, val, mask);
}
///////////////////////////////////////////////////////////////////////////
@@ -1060,29 +1325,31 @@ static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec16_i1 mask) { \
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, __vec16_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
}
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float)
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double)
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double)
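For reference, here is roughly what one of these instantiations expands to under the new (scale, offset) calling convention; this is a sketch assuming the __vec16_f / __vec16_i32 wrappers hold plain v[16] arrays, as elsewhere in this header. The effective byte address for lane i is base + scale * offset.v[i].

static FORCEINLINE __vec16_f
__gather_base_offsets32_float(unsigned char *b, uint32_t scale,
                              __vec16_i32 offset, __vec16_i1 mask) {
    __vec16_f ret;
    int8_t *base = (int8_t *)b;
    for (int i = 0; i < 16; ++i)
        if ((mask.v & (1 << i)) != 0) {
            // each active lane reads one float from base + scale * offset, in bytes
            float *ptr = (float *)(base + scale * offset.v[i]);
            ret.v[i] = *ptr;
        }
    return ret;
}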
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
@@ -1095,39 +1362,46 @@ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
return ret; \
}
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __gather32_i8)
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8)
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __gather32_i8)
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8)
GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __gather32_i16)
GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32)
GATHER_GENERAL(__vec16_f, float, __vec16_i32, __gather32_float)
GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
GATHER_GENERAL(__vec16_d, double, __vec16_i32, __gather32_double)
GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double)
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec16_i1 mask) { \
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, VTYPE val, \
__vec16_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
*ptr = val.v[i]; \
} \
}
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float)
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double)
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double)
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
@@ -1139,14 +1413,18 @@ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
} \
}
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8)
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8)
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16)
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16)
SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32)
SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32)
SCATTER_GENERAL(__vec16_f, float, __vec16_i32, __scatter32_float)
SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float)
SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64)
SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64)
SCATTER_GENERAL(__vec16_d, double, __vec16_i32, __scatter32_double)
SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double)
///////////////////////////////////////////////////////////////////////////
// packed load/store

[Several file diffs suppressed because they are too large, among them two new files:
examples/intrinsics/knc.h (2800 lines) and examples/intrinsics/knc2x.h (2058 lines).]

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -40,8 +40,10 @@ static inline int mandel(float c_re, float c_im, int count) {
float new_re = z_re*z_re - z_im*z_im;
float new_im = 2.f * z_re * z_im;
z_re = c_re + new_re;
z_im = c_im + new_im;
unmasked {
z_re = c_re + new_re;
z_im = c_im + new_im;
}
}
return i;


@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -41,8 +41,10 @@ mandel(float c_re, float c_im, int count) {
float new_re = z_re*z_re - z_im*z_im;
float new_im = 2.f * z_re * z_im;
z_re = c_re + new_re;
z_im = c_im + new_im;
unmasked {
z_re = c_re + new_re;
z_im = c_im + new_im;
}
}
return i;
@@ -79,6 +81,6 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
uniform float dy = (y1 - y0) / height;
uniform int span = 4;
launch[height/span] < mandelbrot_scanline(x0, dx, y0, dy, width, height, span,
maxIterations, output) >;
launch[height/span] mandelbrot_scanline(x0, dx, y0, dy, width, height, span,
maxIterations, output);
}


@@ -77,7 +77,7 @@ black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float T
uniform float ra[], uniform float va[],
uniform float result[], uniform int count) {
uniform int nTasks = max((int)64, (int)count/16384);
launch[nTasks] < bs_task(Sa, Xa, Ta, ra, va, result, count) >;
launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
}
@@ -150,5 +150,5 @@ binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
uniform float va[], uniform float result[],
uniform int count) {
uniform int nTasks = max((int)64, (int)count/16384);
launch[nTasks] < binomial_task(Sa, Xa, Ta, ra, va, result, count) >;
launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);
}


@@ -0,0 +1,7 @@
EXAMPLE=perfbench
CPP_SRC=perfbench.cpp perfbench_serial.cpp
ISPC_SRC=perfbench.ispc
ISPC_TARGETS=sse2,sse4,avx
include ../common.mk


@@ -0,0 +1,108 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <stdio.h>
#include <algorithm>
#include "../timing.h"
#include "perfbench_ispc.h"
typedef void (FuncType)(float *, int, float *, float *);
struct PerfTest {
FuncType *aFunc;
const char *aName;
FuncType *bFunc;
const char *bName;
const char *testName;
};
extern void xyzSumAOS(float *a, int count, float *zeros, float *result);
extern void xyzSumSOA(float *a, int count, float *zeros, float *result);
static void
lInitData(float *ptr, int count) {
for (int i = 0; i < count; ++i)
ptr[i] = float(i) / (1024.f * 1024.f);
}
static PerfTest tests[] = {
{ xyzSumAOS, "serial", ispc::xyzSumAOS, "ispc", "AOS vector element sum (with coalescing)" },
{ xyzSumAOS, "serial", ispc::xyzSumAOSStdlib, "ispc", "AOS vector element sum (stdlib swizzle)" },
{ xyzSumAOS, "serial", ispc::xyzSumAOSNoCoalesce, "ispc", "AOS vector element sum (no coalescing)" },
{ xyzSumSOA, "serial", ispc::xyzSumSOA, "ispc", "SOA vector element sum" },
{ ispc::gathers, "gather", ispc::loads, "vector load", "Memory reads" },
{ ispc::scatters, "scatter", ispc::stores, "vector store", "Memory writes" },
};
int main() {
int count = 3*64*1024;
float *a = new float[count];
float zeros[32] = { 0 };
int nTests = sizeof(tests) / sizeof(tests[0]);
for (int i = 0; i < nTests; ++i) {
lInitData(a, count);
reset_and_start_timer();
float resultA[3] = { 0, 0, 0 };
for (int j = 0; j < 100; ++j)
tests[i].aFunc(a, count, zeros, resultA);
double aTime = get_elapsed_mcycles();
lInitData(a, count);
reset_and_start_timer();
float resultB[3] = { 0, 0, 0 };
for (int j = 0; j < 100; ++j)
tests[i].bFunc(a, count, zeros, resultB);
double bTime = get_elapsed_mcycles();
printf("%-40s: [%.2f] M cycles %s, [%.2f] M cycles %s (%.2fx speedup).\n",
tests[i].testName, aTime, tests[i].aName, bTime, tests[i].bName,
aTime/bTime);
#if 0
printf("\t(%f %f %f) - (%f %f %f)\n", resultSerial[0], resultSerial[1],
resultSerial[2], resultISPC[0], resultISPC[1], resultISPC[2]);
#endif
}
return 0;
}


@@ -0,0 +1,170 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
export void xyzSumAOS(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float xsum = 0, ysum = 0, zsum = 0;
foreach (i = 0 ... count/3) {
float x = array[3*i];
float y = array[3*i+1];
float z = array[3*i+2];
xsum += x;
ysum += y;
zsum += z;
}
result[0] = reduce_add(xsum);
result[1] = reduce_add(ysum);
result[2] = reduce_add(zsum);
}
export void xyzSumAOSStdlib(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float xsum = 0, ysum = 0, zsum = 0;
for (uniform int i = 0; i < 64*1024 /*count/3*/; i += programCount) {
float x, y, z;
aos_to_soa3(&array[3*i], &x, &y, &z);
xsum += x;
ysum += y;
zsum += z;
}
result[0] = reduce_add(xsum);
result[1] = reduce_add(ysum);
result[2] = reduce_add(zsum);
}
export void xyzSumAOSNoCoalesce(uniform float array[], uniform int count,
uniform float zerosArray[], uniform float result[]) {
int zeros = zerosArray[programIndex];
float xsum = 0, ysum = 0, zsum = 0;
foreach (i = 0 ... count/3) {
float x = array[3*i+zeros];
float y = array[3*i+1+zeros];
float z = array[3*i+2+zeros];
xsum += x;
ysum += y;
zsum += z;
}
result[0] = reduce_add(xsum);
result[1] = reduce_add(ysum);
result[2] = reduce_add(zsum);
}
export void xyzSumSOA(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float xsum = 0, ysum = 0, zsum = 0;
uniform float * uniform ap = array;
assert(programCount <= 8);
for (uniform int i = 0; i < count/3; i += 8, ap += 24) {
for (uniform int j = 0; j < 8; j += programCount) {
float x = ap[j + programIndex];
float y = ap[8 + j + programIndex];
float z = ap[16 + j + programIndex];
xsum += x;
ysum += y;
zsum += z;
}
}
result[0] = reduce_add(xsum);
result[1] = reduce_add(ysum);
result[2] = reduce_add(zsum);
}
export void gathers(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float sum = 0;
int zero = zeros[programIndex];
foreach (i = 0 ... count)
sum += array[i + zero];
result[0] = reduce_add(sum);
}
export void loads(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float sum = 0;
foreach (i = 0 ... count)
sum += array[i];
result[0] = reduce_add(sum);
}
export void scatters(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
int zero = zeros[programIndex];
foreach (i = 0 ... count)
array[i + zero] = zero;
}
export void stores(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
int zero = zeros[programIndex];
foreach (i = 0 ... count)
array[i] = zero;
}
export void normalizeAOSNoCoalesce(uniform float array[], uniform int count,
uniform float zeroArray[]) {
float zeros = zeroArray[programIndex];
foreach (i = 0 ... count/3) {
float x = array[3*i+zeros];
float y = array[3*i+1+zeros];
float z = array[3*i+2+zeros];
float l2 = x*x + y*y + z*z;
array[3*i] /= l2;
array[3*i+1] /= l2;
array[3*i+2] /= l2;
}
}
export void normalizeSOA(uniform float array[], uniform int count,
uniform float zeros[]) {
foreach (i = 0 ... count/3) {
float x = array[3*i];
float y = array[3*i+1];
float z = array[3*i+2];
float l2 = x*x + y*y + z*z;
array[3*i] /= l2;
array[3*i+1] /= l2;
array[3*i+2] /= l2;
}
}


@@ -0,0 +1,175 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{d923bb7e-a7c8-4850-8fcf-0eb9ce35b4e8}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>perfbench</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="perfbench.cpp" />
<ClCompile Include="perfbench_serial.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="perfbench.ispc">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>


@@ -0,0 +1,61 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <math.h>
void
xyzSumAOS(float *a, int count, float *zeros, float *result) {
float xsum = 0, ysum = 0, zsum = 0;
for (int i = 0; i < count; i += 3) {
xsum += a[i];
ysum += a[i+1];
zsum += a[i+2];
}
result[0] = xsum;
result[1] = ysum;
result[2] = zsum;
}
void
xyzSumSOA(float *a, int count, float *zeros, float *result) {
float xsum = 0, ysum = 0, zsum = 0;
for (int i = 0; i < count/3; ++i) {
float *p = a + (i >> 3) * 24 + (i & 7);
xsum += p[0];
ysum += p[8];
zsum += p[16];
}
result[0] = xsum;
result[1] = ysum;
result[2] = zsum;
}
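The indexing above assumes a hybrid SOA layout with blocks of eight points: each block stores 8 x values, then 8 y values, then 8 z values (24 floats total), matching the ispc xyzSumSOA kernel earlier in the diff. A minimal standalone sketch of that addressing (hypothetical helper, for illustration only):

#include <cstddef>

// For point i in an SOA-of-8 layout: block = i / 8, lane = i % 8, with
// x at a[block*24 + lane], y at a[block*24 + 8 + lane], z at a[block*24 + 16 + lane].
inline void soa8_xyz(const float *a, std::size_t i, float &x, float &y, float &z) {
    const float *p = a + (i >> 3) * 24 + (i & 7);
    x = p[0];
    y = p[8];
    z = p[16];
}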


@@ -43,17 +43,17 @@ struct Ray {
};
struct Triangle {
uniform float p[3][4];
uniform int id;
uniform int pad[3];
float p[3][4];
int id;
int pad[3];
};
struct LinearBVHNode {
uniform float bounds[2][3];
uniform unsigned int offset; // num primitives for leaf, second child for interior
uniform unsigned int8 nPrimitives;
uniform unsigned int8 splitAxis;
uniform unsigned int16 pad;
float bounds[2][3];
unsigned int offset; // num primitives for leaf, second child for interior
unsigned int8 nPrimitives;
unsigned int8 splitAxis;
unsigned int16 pad;
};
static inline float3 Cross(const float3 v1, const float3 v2) {
@@ -88,9 +88,12 @@ static void generateRay(uniform const float raster2camera[4][4],
camy /= camw;
camz /= camw;
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
camera2world[2][2] * camz;
ray.origin.x = camera2world[0][3] / camera2world[3][3];
ray.origin.y = camera2world[1][3] / camera2world[3][3];
@@ -143,7 +146,7 @@ static bool BBoxIntersect(const uniform float bounds[2][3],
static bool TriIntersect(const Triangle &tri, Ray &ray) {
static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
@@ -183,8 +186,8 @@ static bool TriIntersect(const Triangle &tri, Ray &ray) {
}
bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
Ray &r) {
bool BVHIntersect(const uniform LinearBVHNode nodes[],
const uniform Triangle tris[], Ray &r) {
Ray ray = r;
bool hit = false;
// Follow ray through BVH nodes to find primitive intersections
@@ -193,7 +196,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
while (true) {
// Check ray against BVH node
LinearBVHNode node = nodes[nodeNum];
uniform LinearBVHNode node = nodes[nodeNum];
if (any(BBoxIntersect(node.bounds, ray))) {
uniform unsigned int nPrimitives = node.nPrimitives;
if (nPrimitives > 0) {
@@ -239,8 +242,8 @@ static void raytrace_tile(uniform int x0, uniform int x1,
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
const uniform LinearBVHNode nodes[],
const uniform Triangle triangles[]) {
uniform float widthScale = (float)(baseWidth) / (float)(width);
uniform float heightScale = (float)(baseHeight) / (float)(height);
@@ -262,8 +265,8 @@ export void raytrace_ispc(uniform int width, uniform int height,
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
const uniform LinearBVHNode nodes[],
const uniform Triangle triangles[]) {
raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
raster2camera, camera2world, image,
id, nodes, triangles);
@@ -275,8 +278,8 @@ task void raytrace_tile_task(uniform int width, uniform int height,
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
const uniform LinearBVHNode nodes[],
const uniform Triangle triangles[]) {
uniform int dx = 16, dy = 16; // must match dx, dy below
uniform int xBuckets = (width + (dx-1)) / dx;
uniform int x0 = (taskIndex % xBuckets) * dx;
@@ -295,14 +298,14 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
const uniform LinearBVHNode nodes[],
const uniform Triangle triangles[]) {
uniform int dx = 16, dy = 16;
uniform int xBuckets = (width + (dx-1)) / dx;
uniform int yBuckets = (height + (dy-1)) / dy;
uniform int nTasks = xBuckets * yBuckets;
launch[nTasks] < raytrace_tile_task(width, height, baseWidth, baseHeight,
raster2camera, camera2world,
image, id, nodes, triangles) >;
launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight,
raster2camera, camera2world,
image, id, nodes, triangles);
}


@@ -123,9 +123,12 @@ static void generateRay(const float raster2camera[4][4],
camy /= camw;
camz /= camw;
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
camera2world[2][2] * camz;
ray.origin.x = camera2world[0][3] / camera2world[3][3];
ray.origin.y = camera2world[1][3] / camera2world[3][3];


@@ -88,11 +88,11 @@ loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
// Parallelize across cores as well: each task will work on a slice
// of 1 in the z extent of the volume.
if ((t & 1) == 0)
launch[z1-z0] < stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
coef, vsq, Aeven, Aodd) >;
launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
coef, vsq, Aeven, Aodd);
else
launch[z1-z0] < stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
coef, vsq, Aodd, Aeven) >;
launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
coef, vsq, Aodd, Aeven);
// We need to wait for all of the launched tasks to finish before
// starting the next iteration.


@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -40,21 +40,68 @@
Runtime Requirements" for information about the task-related entrypoints
that are implemented here.
There are three task systems in this file: one built using Microsoft's
Concurrency Runtime, one built with Apple's Grand Central Dispatch, and
one built on top of bare pthreads.
There are several task systems in this file, built using:
- Microsoft's Concurrency Runtime (ISPC_USE_CONCRT)
- Apple's Grand Central Dispatch (ISPC_USE_GCD)
- bare pthreads (ISPC_USE_PTHREADS, ISPC_USE_PTHREADS_FULLY_SUBSCRIBED)
- Cilk Plus (ISPC_USE_CILK)
- TBB (ISPC_USE_TBB_TASK_GROUP, ISPC_USE_TBB_PARALLEL_FOR)
- OpenMP (ISPC_USE_OMP)
The task system implementation can be selected at compile time by defining
the appropriate preprocessor symbol on the command line (e.g., -D ISPC_USE_TBB).
Not all combinations of platform and task system are meaningful.
If no task system is requested, a reasonable default task system for the platform
is selected. Here are the task systems that can be selected:
#define ISPC_USE_GCD
#define ISPC_USE_CONCRT
#define ISPC_USE_PTHREADS
#define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
#define ISPC_USE_CILK
#define ISPC_USE_OMP
#define ISPC_USE_TBB_TASK_GROUP
#define ISPC_USE_TBB_PARALLEL_FOR
The ISPC_USE_PTHREADS_FULLY_SUBSCRIBED model essentially takes over the machine
by assigning one pthread to each hyper-thread, and then uses spinlocks and atomics
for task management. This model is useful for KNC where tasks can take over
the machine, but less so when there are other tasks that need running on the machine.
#define ISPC_USE_CREW
*/
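Since the block below only chooses a default when none of these symbols has already been defined, the selection is normally made with -D on the compiler command line, as the comment above says. As a minimal sketch, the same choice could also be pinned in a hypothetical wrapper translation unit (illustration only, not part of this file):

// select_tasksys.cpp -- hypothetical wrapper that fixes the task system choice.
// The usual route is simply -DISPC_USE_TBB_TASK_GROUP when compiling tasksys.cpp,
// plus linking against TBB.
#define ISPC_USE_TBB_TASK_GROUP
#include "tasksys.cpp"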
#if !(defined ISPC_USE_CONCRT || defined ISPC_USE_GCD || \
defined ISPC_USE_PTHREADS || defined ISPC_USE_PTHREADS_FULLY_SUBSCRIBED || \
defined ISPC_USE_TBB_TASK_GROUP || defined ISPC_USE_TBB_PARALLEL_FOR || \
defined ISPC_USE_OMP || defined ISPC_USE_CILK )
// If no task model chosen from the compiler cmdline, pick a reasonable default
#if defined(_WIN32) || defined(_WIN64)
#define ISPC_USE_CONCRT
#elif defined(__linux__)
#define ISPC_USE_PTHREADS
#elif defined(__APPLE__)
#define ISPC_USE_GCD
#endif
#if defined(__KNC__)
#define ISPC_USE_PTHREADS
#endif
#endif // No task model specified on compiler cmdline
#if defined(_WIN32) || defined(_WIN64)
#define ISPC_IS_WINDOWS
#define ISPC_USE_CONCRT
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#define ISPC_USE_PTHREADS
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#define ISPC_USE_GCD
#define ISPC_IS_APPLE
#endif
#if defined(__KNC__)
#define ISPC_IS_KNC
#endif
#define DBG(x)
@@ -83,9 +130,37 @@
#include <vector>
#include <algorithm>
#endif // ISPC_USE_PTHREADS
#ifdef ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
#include <pthread.h>
#include <semaphore.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <sys/sysctl.h>
#include <vector>
#include <algorithm>
//#include <stdexcept>
#include <stack>
#endif // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
#ifdef ISPC_USE_TBB_PARALLEL_FOR
#include <tbb/parallel_for.h>
#endif // ISPC_USE_TBB_PARALLEL_FOR
#ifdef ISPC_USE_TBB_TASK_GROUP
#include <tbb/task_group.h>
#endif // ISPC_USE_TBB_TASK_GROUP
#ifdef ISPC_USE_CILK
#include <cilk/cilk.h>
#endif // ISPC_USE_CILK
#ifdef ISPC_USE_OMP
#include <omp.h>
#endif // ISPC_USE_OMP
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif // ISPC_IS_LINUX
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
@@ -107,6 +182,13 @@ struct TaskInfo {
#endif
};
// ispc expects these functions to have C linkage / not be mangled
extern "C" {
void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
void ISPCSync(void *handle);
}
///////////////////////////////////////////////////////////////////////////
// TaskGroupBase
@@ -181,7 +263,7 @@ inline TaskGroupBase::~TaskGroupBase() {
// Note: don't delete memBuffers[0], since it points to the start of
// the "mem" member!
for (int i = 1; i < NUM_MEM_BUFFERS; ++i)
delete[] memBuffers[i];
delete[](memBuffers[i]);
}
@@ -224,10 +306,10 @@ TaskGroupBase::GetTaskInfo(int index) {
inline void *
TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
char *basePtr = memBuffers[curMemBuffer];
int64_t iptr = (int64_t)(basePtr + curMemBufferOffset);
intptr_t iptr = (intptr_t)(basePtr + curMemBufferOffset);
iptr = (iptr + (alignment-1)) & ~(alignment-1);
int newOffset = int(iptr + size - (int64_t)basePtr);
int newOffset = int(iptr - (intptr_t)basePtr + size);
if (newOffset < memBufferSize[curMemBuffer]) {
curMemBufferOffset = newOffset;
return (char *)iptr;
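The int64_t-to-intptr_t change above is the usual align-up-on-a-pointer idiom. A self-contained sketch of the same computation (hypothetical helper, assuming the alignment is a power of two):

#include <cstdint>
#include <cstddef>

// Round ptr up to the next multiple of alignment (alignment must be a power of two).
static inline char *alignUp(char *ptr, std::size_t alignment) {
    std::uintptr_t p = reinterpret_cast<std::uintptr_t>(ptr);
    p = (p + (alignment - 1)) & ~static_cast<std::uintptr_t>(alignment - 1);
    return reinterpret_cast<char *>(p);
}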
@@ -249,14 +331,6 @@ TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
///////////////////////////////////////////////////////////////////////////
// Atomics and the like
#ifndef ISPC_IS_WINDOWS
static inline void
lMemFence() {
__asm__ __volatile__("mfence":::"memory");
}
#endif // !ISPC_IS_WINDOWS
#if (__SIZEOF_POINTER__ == 4) || defined(__i386__) || defined(_WIN32)
#define ISPC_POINTER_BYTES 4
#elif (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64)
@@ -266,6 +340,15 @@ lMemFence() {
#endif // __SIZEOF_POINTER__
static inline void
lMemFence() {
// Windows atomic functions already contain the fence
// KNC doesn't need the memory barrier
#if !defined ISPC_IS_KNC && !defined ISPC_IS_WINDOWS
__asm__ __volatile__("mfence":::"memory");
#endif
}
static void *
lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
#ifdef ISPC_IS_WINDOWS
@@ -288,11 +371,11 @@ lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
#endif // ISPC_IS_WINDOWS
}
#ifndef ISPC_IS_WINDOWS
static int32_t
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
#ifdef ISPC_IS_WINDOWS
return InterlockedCompareExchange(v, newValue, oldValue);
#else
int32_t result;
__asm__ __volatile__("lock\ncmpxchgl %2,%1"
: "=a"(result), "=m"(*v)
@@ -300,9 +383,22 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue)
: "memory");
lMemFence();
return result;
#endif // ISPC_IS_WINDOWS
}
#endif // !ISPC_IS_WINDOWS
static inline int32_t
lAtomicAdd(volatile int32_t *v, int32_t delta) {
#ifdef ISPC_IS_WINDOWS
return InterlockedAdd(v, delta);
#else
int32_t origValue;
__asm__ __volatile__("lock\n"
"xaddl %0,%1"
: "=r"(origValue), "=m"(*v) : "0"(delta)
: "memory");
return origValue;
#endif
}
///////////////////////////////////////////////////////////////////////////
@@ -366,6 +462,50 @@ private:
#endif // ISPC_USE_PTHREADS
#ifdef ISPC_USE_CILK
class TaskGroup : public TaskGroupBase {
public:
void Launch(int baseIndex, int count);
void Sync();
};
#endif // ISPC_USE_CILK
#ifdef ISPC_USE_OMP
class TaskGroup : public TaskGroupBase {
public:
void Launch(int baseIndex, int count);
void Sync();
};
#endif // ISPC_USE_OMP
#ifdef ISPC_USE_TBB_PARALLEL_FOR
class TaskGroup : public TaskGroupBase {
public:
void Launch(int baseIndex, int count);
void Sync();
};
#endif // ISPC_USE_TBB_PARALLEL_FOR
#ifdef ISPC_USE_TBB_TASK_GROUP
class TaskGroup : public TaskGroupBase {
public:
void Launch(int baseIndex, int count);
void Sync();
private:
tbb::task_group tbbTaskGroup;
};
#endif // ISPC_USE_TBB_TASK_GROUP
///////////////////////////////////////////////////////////////////////////
// Grand Central Dispatch
@@ -487,18 +627,6 @@ static pthread_mutex_t taskSysMutex;
static std::vector<TaskGroup *> activeTaskGroups;
static sem_t *workerSemaphore;
static inline int32_t
lAtomicAdd(int32_t *v, int32_t delta) {
int32_t origValue;
__asm__ __volatile__("lock\n"
"xaddl %0,%1"
: "=r"(origValue), "=m"(*v) : "0"(delta)
: "memory");
return origValue;
}
static void *
lTaskEntry(void *arg) {
int threadIndex = (int)((int64_t)arg);
@@ -724,11 +852,15 @@ TaskGroup::Sync() {
exit(1);
}
// FIXME: We basically end up busy-waiting here, which is
// extra wasteful in a world with hyperthreading. It would
// extra wasteful in a world with hyper-threading. It would
// be much better to put this thread to sleep on a
// condition variable that was signaled when the last task
// in this group was finished.
sleep(0);
#ifndef ISPC_IS_KNC
usleep(1);
#else
_mm_delay_32(8);
#endif
continue;
}
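The FIXME above describes replacing this busy-wait with a sleep on a condition variable that is signaled when the last task in the group completes. A rough sketch of that pattern (hypothetical, not what the file implements):

#include <condition_variable>
#include <mutex>

// Each task group would hold one of these; workers call taskDone() as they
// finish, and Sync() calls waitAll() instead of spinning.
struct WaitableCounter {
    std::mutex m;
    std::condition_variable cv;
    int remaining = 0;

    void taskDone() {
        std::lock_guard<std::mutex> lock(m);
        if (--remaining == 0)
            cv.notify_all();
    }
    void waitAll() {
        std::unique_lock<std::mutex> lock(m);
        cv.wait(lock, [this] { return remaining == 0; });
    }
};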
@@ -772,6 +904,124 @@ TaskGroup::Sync() {
#endif // ISPC_USE_PTHREADS
///////////////////////////////////////////////////////////////////////////
// Cilk Plus
#ifdef ISPC_USE_CILK
static void
InitTaskSystem() {
// No initialization needed
}
inline void
TaskGroup::Launch(int baseIndex, int count) {
cilk_for(int i = 0; i < count; i++) {
TaskInfo *ti = GetTaskInfo(baseIndex + i);
// Actually run the task.
// Cilk does not expose the task -> thread mapping so we pretend it's 1:1
ti->func(ti->data, ti->taskIndex, ti->taskCount, ti->taskIndex, ti->taskCount);
}
}
inline void
TaskGroup::Sync() {
}
#endif // ISPC_USE_CILK
///////////////////////////////////////////////////////////////////////////
// OpenMP
#ifdef ISPC_USE_OMP
static void
InitTaskSystem() {
// No initialization needed
}
inline void
TaskGroup::Launch(int baseIndex, int count) {
#pragma omp parallel for
for(int i = 0; i < count; i++) {
TaskInfo *ti = GetTaskInfo(baseIndex + i);
// Actually run the task.
int threadIndex = omp_get_thread_num();
int threadCount = omp_get_num_threads();
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
}
}
inline void
TaskGroup::Sync() {
}
#endif // ISPC_USE_OMP
///////////////////////////////////////////////////////////////////////////
// Thread Building Blocks
#ifdef ISPC_USE_TBB_PARALLEL_FOR
static void
InitTaskSystem() {
// No initialization needed by default
//tbb::task_scheduler_init();
}
inline void
TaskGroup::Launch(int baseIndex, int count) {
tbb::parallel_for(0, count, [=](int i) {
TaskInfo *ti = GetTaskInfo(baseIndex + i);
// Actually run the task.
// TBB does not expose the task -> thread mapping so we pretend it's 1:1
int threadIndex = ti->taskIndex;
int threadCount = ti->taskCount;
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
});
}
inline void
TaskGroup::Sync() {
}
#endif // ISPC_USE_TBB_PARALLEL_FOR
#ifdef ISPC_USE_TBB_TASK_GROUP
static void
InitTaskSystem() {
// No initialization needed by default
//tbb::task_scheduler_init();
}
inline void
TaskGroup::Launch(int baseIndex, int count) {
for (int i = 0; i < count; i++) {
tbbTaskGroup.run([=]() {
TaskInfo *ti = GetTaskInfo(baseIndex + i);
// TBB does not expose the task -> thread mapping so we pretend it's 1:1
int threadIndex = ti->taskIndex;
int threadCount = ti->taskCount;
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
});
}
}
inline void
TaskGroup::Sync() {
tbbTaskGroup.wait();
}
#endif // ISPC_USE_TBB_TASK_GROUP
///////////////////////////////////////////////////////////////////////////
#ifndef ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
#define MAX_FREE_TASK_GROUPS 64
static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
@@ -783,7 +1033,6 @@ AllocTaskGroup() {
if (tg != NULL) {
void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);
if (ptr != NULL) {
assert(ptr == tg);
return (TaskGroup *)ptr;
}
}
@@ -810,13 +1059,6 @@ FreeTaskGroup(TaskGroup *tg) {
///////////////////////////////////////////////////////////////////////////
// ispc expects these functions to have C linkage / not be mangled
extern "C" {
void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
void ISPCSync(void *handle);
}
void
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
TaskGroup *taskGroup;
@@ -863,3 +1105,250 @@ ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) {
return taskGroup->AllocMemory(size, alignment);
}
#else // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
#define MAX_LIVE_TASKS 1024
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
// Small structure used to hold the data for each task
struct Task {
public:
TaskFuncType func;
void *data;
volatile int32_t taskIndex;
int taskCount;
volatile int numDone;
int liveIndex; // index in live task queue
inline int noMoreWork() { return taskIndex >= taskCount; }
/*! given thread is done working on this task --> decrease num locks */
// inline void lock() { lAtomicAdd(&locks,1); }
// inline void unlock() { lAtomicAdd(&locks,-1); }
inline int nextJob() { return lAtomicAdd(&taskIndex,1); }
inline int numJobs() { return taskCount; }
inline void schedule(int idx) { taskIndex = 0; numDone = 0; liveIndex = idx; }
inline void run(int idx, int threadIdx);
inline void markOneDone() { lAtomicAdd(&numDone,1); }
inline void wait()
{
while (!noMoreWork()) {
int next = nextJob();
if (next < numJobs()) run(next, 0);
}
while (numDone != taskCount) {
#ifndef ISPC_IS_KNC
usleep(1);
#else
_mm_delay_32(8);
#endif
}
}
};
///////////////////////////////////////////////////////////////////////////
class TaskSys {
static int numThreadsRunning;
struct LiveTask
{
volatile int locks; /*!< num locks on this task. gets
initialized to NUM_THREADS+1, then counted
down by every thread that sees this. this
value is only valid when 'active' is set
to true */
volatile int active; /*! workers will spin on this until it
becomes active */
Task *task;
inline void doneWithThis() { lAtomicAdd(&locks,-1); }
LiveTask() : active(0), locks(-1) {}
};
public:
volatile int nextScheduleIndex; /*! next index in the task queue
where we'll insert a live task */
// inline int inc_begin() { int old = begin; begin = (begin+1)%MAX_TASKS; return old; }
// inline int inc_end() { int old = end; end = (end+1)%MAX_TASKS; return old; }
LiveTask taskQueue[MAX_LIVE_TASKS];
std::stack<Task *> taskMem;
static TaskSys *global;
TaskSys() : nextScheduleIndex(0)
{
TaskSys::global = this;
Task *mem = new Task[MAX_LIVE_TASKS]; //< could actually be more than _live_ tasks
for (int i=0;i<MAX_LIVE_TASKS;i++) {
taskMem.push(mem+i);
}
createThreads();
}
inline Task *allocOne()
{
pthread_mutex_lock(&mutex);
if (taskMem.empty()) {
fprintf(stderr, "Too many live tasks. "
"Change the value of MAX_LIVE_TASKS and recompile.\n");
exit(1);
}
Task *task = taskMem.top();
taskMem.pop();
pthread_mutex_unlock(&mutex);
return task;
}
static inline void init()
{
if (global) return;
pthread_mutex_lock(&mutex);
if (global == NULL) global = new TaskSys;
pthread_mutex_unlock(&mutex);
}
void createThreads();
int nThreads;
pthread_t *thread;
void threadFct();
inline void schedule(Task *t)
{
pthread_mutex_lock(&mutex);
int liveIndex = nextScheduleIndex;
nextScheduleIndex = (nextScheduleIndex+1)%MAX_LIVE_TASKS;
if (taskQueue[liveIndex].active) {
fprintf(stderr, "Out of task queue resources. "
"Change the value of MAX_LIVE_TASKS and recompile.\n");
exit(1);
}
taskQueue[liveIndex].task = t;
t->schedule(liveIndex);
taskQueue[liveIndex].locks = numThreadsRunning+1; // num _worker_ threads plus creator
taskQueue[liveIndex].active = true;
pthread_mutex_unlock(&mutex);
}
void sync(Task *task)
{
task->wait();
int liveIndex = task->liveIndex;
while (taskQueue[liveIndex].locks > 1) {
#ifndef ISPC_IS_KNC
usleep(1);
#else
_mm_delay_32(8);
#endif
}
_mm_free(task->data);
pthread_mutex_lock(&mutex);
taskMem.push(task); // recycle task index
taskQueue[liveIndex].active = false;
pthread_mutex_unlock(&mutex);
}
};
void TaskSys::threadFct()
{
int myIndex = 0; //lAtomicAdd(&threadIdx,1);
while (1) {
while (!taskQueue[myIndex].active) {
#ifndef ISPC_IS_KNC
usleep(4);
#else
_mm_delay_32(32);
#endif
continue;
}
Task *mine = taskQueue[myIndex].task;
while (!mine->noMoreWork()) {
int job = mine->nextJob();
if (job >= mine->numJobs()) break;
mine->run(job,myIndex);
}
taskQueue[myIndex].doneWithThis();
myIndex = (myIndex+1)%MAX_LIVE_TASKS;
}
}
inline void Task::run(int idx, int threadIdx) {
(*this->func)(data,threadIdx,TaskSys::global->nThreads,idx,taskCount);
markOneDone();
}
void *_threadFct(void *data) {
((TaskSys*)data)->threadFct();
return NULL;
}
void TaskSys::createThreads()
{
init();
int reserved = 4;
int minid = 2;
nThreads = sysconf(_SC_NPROCESSORS_ONLN) - reserved;
thread = (pthread_t *)malloc(nThreads * sizeof(pthread_t));
numThreadsRunning = 0;
for (int i = 0; i < nThreads; ++i) {
pthread_attr_t attr;
pthread_attr_init(&attr);
pthread_attr_setstacksize(&attr, 2*1024 * 1024);
int threadID = minid+i;
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(threadID,&cpuset);
int ret = pthread_attr_setaffinity_np(&attr,sizeof(cpuset),&cpuset);
int err = pthread_create(&thread[i], &attr, &_threadFct, this);
++numThreadsRunning;
if (err != 0) {
fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
exit(1);
}
}
}
TaskSys * TaskSys::global = NULL;
int TaskSys::numThreadsRunning = 0;
///////////////////////////////////////////////////////////////////////////
void ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count)
{
Task *ti = *(Task**)taskGroupPtr;
ti->func = (TaskFuncType)func;
ti->data = data;
ti->taskIndex = 0;
ti->taskCount = count;
TaskSys::global->schedule(ti);
}
void ISPCSync(void *h)
{
Task *task = (Task *)h;
assert(task);
TaskSys::global->sync(task);
}
void *ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment)
{
TaskSys::init();
Task *task = TaskSys::global->allocOne();
*taskGroupPtr = task;
task->data = _mm_malloc(size,alignment);
return task->data;//*taskGroupPtr;
}
#endif // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED


@@ -43,9 +43,15 @@ extern "C" {
#endif /* __cplusplus */
__inline__ uint64_t rdtsc() {
uint32_t low, high;
#ifdef __x86_64
__asm__ __volatile__ (
"xorl %%eax,%%eax \n cpuid"
::: "%rax", "%rbx", "%rcx", "%rdx" );
#else
__asm__ __volatile__ (
"xorl %%eax,%%eax \n cpuid"
::: "%eax", "%ebx", "%ecx", "%edx" );
#endif
__asm__ __volatile__ (
"rdtsc" : "=a" (low), "=d" (high));
return (uint64_t)high << 32 | low;
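The cpuid instruction before rdtsc serializes the pipeline so the timestamp is not read before earlier instructions retire. A minimal usage sketch (hypothetical example; in practice you would just include this header, whose helpers such as get_elapsed_mcycles() build on the same primitive):

#include <cstdint>
#include <cstdio>

extern "C" uint64_t rdtsc();   // declaration standing in for the inline rdtsc() above

void timeRegion() {
    uint64_t start = rdtsc();
    // ... code under measurement ...
    uint64_t cycles = rdtsc() - start;
    printf("%llu cycles\n", (unsigned long long)cycles);
}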


@@ -336,6 +336,6 @@ volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
// Launch tasks to work on (dx,dy)-sized tiles of the image
uniform int dx = 8, dy = 8;
uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
launch[nTasks] < volume_task(density, nVoxels, raster2camera, camera2world,
width, height, image) >;
launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
width, height, image);
}

expr.cpp (3371 changed lines; diff suppressed because it is too large)

expr.h (71 changed lines)

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -284,6 +284,10 @@ public:
int EstimateCost() const;
Expr *baseExpr, *index;
private:
mutable const Type *type;
mutable const PointerType *lvalueType;
};
@@ -299,7 +303,6 @@ public:
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
const Type *GetType() const;
const Type *GetLValueType() const;
Symbol *GetBaseSymbol() const;
void Print() const;
Expr *Optimize();
@@ -321,6 +324,9 @@ public:
member is found. (i.e. this is true if the MemberExpr was a '->'
operator, and is false if it was a '.' operator. */
bool dereferenceExpr;
protected:
mutable const Type *type, *lvalueType;
};
@@ -531,26 +537,48 @@ public:
};
/** @brief Expression that represents dereferencing a reference to get its
value. */
class DereferenceExpr : public Expr {
/** @brief Common base class that provides shared functionality for
PtrDerefExpr and RefDerefExpr. */
class DerefExpr : public Expr {
public:
DereferenceExpr(Expr *e, SourcePos p);
DerefExpr(Expr *e, SourcePos p);
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
const Type *GetType() const;
const Type *GetLValueType() const;
Symbol *GetBaseSymbol() const;
void Print() const;
Expr *TypeCheck();
Expr *Optimize();
int EstimateCost() const;
Expr *expr;
};
/** @brief Expression that represents dereferencing a pointer to get its
value. */
class PtrDerefExpr : public DerefExpr {
public:
PtrDerefExpr(Expr *e, SourcePos p);
const Type *GetType() const;
void Print() const;
Expr *TypeCheck();
int EstimateCost() const;
};
/** @brief Expression that represents dereferencing a reference to get its
value. */
class RefDerefExpr : public DerefExpr {
public:
RefDerefExpr(Expr *e, SourcePos p);
const Type *GetType() const;
void Print() const;
Expr *TypeCheck();
int EstimateCost() const;
};
/** Expression that represents taking the address of an expression. */
class AddressOfExpr : public Expr {
public:
@@ -563,6 +591,7 @@ public:
Expr *TypeCheck();
Expr *Optimize();
int EstimateCost() const;
llvm::Constant *GetConstant(const Type *type) const;
Expr *expr;
};
@@ -630,20 +659,26 @@ public:
function overloading, this method resolves which actual function
the arguments match best. If the argCouldBeNULL parameter is
non-NULL, each element indicates whether the corresponding argument
is the number zero, indicating that it could be a NULL pointer.
This parameter may be NULL (for cases where overload resolution is
being done just given type information without the parameter
argument expressions being available. It returns true on success.
is the number zero, indicating that it could be a NULL pointer, and
if argIsConstant is non-NULL, each element indicates whether the
corresponding argument is a compile-time constant value. Both of
these parameters may be NULL (for cases where overload resolution
is being done just given type information without the parameter
argument expressions being available). This function returns true
on success.
*/
bool ResolveOverloads(SourcePos argPos,
const std::vector<const Type *> &argTypes,
const std::vector<bool> *argCouldBeNULL = NULL);
const std::vector<bool> *argCouldBeNULL = NULL,
const std::vector<bool> *argIsConstant = NULL);
Symbol *GetMatchingFunction();
private:
bool tryResolve(int (*matchFunc)(const Type *, const Type *),
SourcePos argPos, const std::vector<const Type *> &argTypes,
const std::vector<bool> *argCouldBeNULL);
std::vector<Symbol *> getCandidateFunctions(int argCount) const;
static int computeOverloadCost(const FunctionType *ftype,
const std::vector<const Type *> &argTypes,
const std::vector<bool> *argCouldBeNULL,
const std::vector<bool> *argIsConstant);
/** Name of the function that is being called. */
std::string name;

func.cpp (140 changed lines)

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,21 @@
#include "util.h"
#include <stdio.h>
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
#include <llvm/DerivedTypes.h>
#include <llvm/Instructions.h>
#include <llvm/Intrinsics.h>
#if defined(LLVM_3_1) || defined(LLVM_3_2)
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
#include <llvm/Instructions.h>
#include <llvm/Intrinsics.h>
#include <llvm/DerivedTypes.h>
#else
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Type.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/DerivedTypes.h>
#endif
#include <llvm/PassManager.h>
#include <llvm/PassRegistry.h>
#include <llvm/Transforms/IPO.h>
@@ -59,16 +68,14 @@
#include <llvm/Support/FileUtilities.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/Target/TargetOptions.h>
#include <llvm/Target/TargetData.h>
#include <llvm/PassManager.h>
#include <llvm/Analysis/Verifier.h>
#include <llvm/Support/CFG.h>
#include <llvm/Support/ToolOutputFile.h>
#include <llvm/Assembly/PrintModulePass.h>
Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
Function::Function(Symbol *s, Stmt *c) {
sym = s;
args = a;
code = c;
maskSymbol = m->symbolTable->LookupVariable("__mask");
@@ -101,12 +108,20 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
printf("\n\n\n");
}
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
for (unsigned int i = 0; i < args.size(); ++i)
if (dynamic_cast<const ReferenceType *>(args[i]->type) == NULL)
args[i]->parentFunction = this;
for (int i = 0; i < type->GetNumParameters(); ++i) {
const char *paramName = type->GetParameterName(i).c_str();
Symbol *sym = m->symbolTable->LookupVariable(paramName);
if (sym == NULL)
Assert(strncmp(paramName, "__anon_parameter_", 17) == 0);
args.push_back(sym);
const Type *t = type->GetParameterType(i);
if (sym != NULL && CastType<ReferenceType>(t) == NULL)
sym->parentFunction = this;
}
if (type->isTask) {
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
@@ -125,7 +140,7 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
const Type *
Function::GetReturnType() const {
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
return type->GetReturnType();
}
@@ -133,7 +148,7 @@ Function::GetReturnType() const {
const FunctionType *
Function::GetType() const {
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
return type;
}
@@ -145,7 +160,8 @@ Function::GetType() const {
'mem2reg' pass will in turn promote to SSA registers.
*/
static void
lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol *> &args,
lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const
std::vector<Symbol *> &args,
FunctionEmitContext *ctx) {
// We expect the argument structure to come in as a pointer to a
// structure. Confirm and figure out its type here.
@@ -157,9 +173,13 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
llvm::dyn_cast<const llvm::StructType>(pt->getElementType());
// Get the type of the argument we're copying in and its Symbol pointer
LLVM_TYPE_CONST llvm::Type *argType = argStructType->getElementType(i);
llvm::Type *argType = argStructType->getElementType(i);
Symbol *sym = args[i];
if (sym == NULL)
// anonymous parameter, so don't worry about it
return;
// allocate space to copy the parameter in to
sym->storagePtr = ctx->AllocaInst(argType, sym->name.c_str());
@@ -170,7 +190,7 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
// memory
llvm::Value *ptrval = ctx->LoadInst(ptr, sym->name.c_str());
ctx->StoreInst(ptrval, sym->storagePtr);
ctx->EmitFunctionParameterDebugInfo(sym);
ctx->EmitFunctionParameterDebugInfo(sym, i);
}
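For orientation, a hedged sketch of the argument structure this helper indexes into, as implied by the per-element offsets here and the mask copy in emitCode() below; the example signature, member names, and mask representation are illustrative only:
// Hypothetical `task void f(uniform float a, uniform int b)`: the callee is
// assumed to receive a pointer to something laid out like this, with the
// caller's execution mask appended as the final member.
typedef int32_t IllustrativeMask[8];   // the real mask layout is target-dependent
struct f_task_args_t {
    float            a;     // element 0: first declared parameter
    int32_t          b;     // element 1: second declared parameter
    IllustrativeMask mask;  // element nArgs: execution mask (see emitCode below)
};
// lCopyInTaskParameter(i, structArgPtr, args, ctx) then loads element i and
// stores it into a fresh alloca so the parameter acts like a normal local.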
@@ -186,14 +206,14 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
// value
maskSymbol->storagePtr = ctx->GetFullMaskPointer();
// add debugging info for __mask, programIndex, ...
// add debugging info for __mask
maskSymbol->pos = firstStmtPos;
ctx->EmitVariableDebugInfo(maskSymbol);
#if 0
llvm::BasicBlock *entryBBlock = ctx->GetCurrentBasicBlock();
#endif
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
if (type->isTask == true) {
// For tasks, there should always be three parameters: the
@@ -211,13 +231,15 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
for (unsigned int i = 0; i < args.size(); ++i)
lCopyInTaskParameter(i, structParamPtr, args, ctx);
// Copy in the mask as well.
int nArgs = (int)args.size();
// The mask is the last parameter in the argument structure
llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL,
"task_struct_mask");
llvm::Value *ptrval = ctx->LoadInst(ptr, "mask");
ctx->SetFunctionMask(ptrval);
if (type->isUnmasked == false) {
// Copy in the mask as well.
int nArgs = (int)args.size();
// The mask is the last parameter in the argument structure
llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL,
"task_struct_mask");
llvm::Value *ptrval = ctx->LoadInst(ptr, "mask");
ctx->SetFunctionMask(ptrval);
}
// Copy threadIndex and threadCount into stack-allocated storage so
// that their symbols point to something reasonable.
@@ -240,13 +262,17 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
llvm::Function::arg_iterator argIter = function->arg_begin();
for (unsigned int i = 0; i < args.size(); ++i, ++argIter) {
Symbol *sym = args[i];
if (sym == NULL)
// anonymous function parameter
continue;
argIter->setName(sym->name.c_str());
// Allocate stack storage for the parameter and emit code
// to store its value there.
sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str());
ctx->StoreInst(argIter, sym->storagePtr);
ctx->EmitFunctionParameterDebugInfo(sym);
ctx->EmitFunctionParameterDebugInfo(sym, i);
}
// If the number of actual function arguments is equal to the
@@ -254,9 +280,13 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
// don't have a mask parameter, so set it to be all on. This
// happens for example with 'export'ed functions that the app
// calls.
if (argIter == function->arg_end())
if (argIter == function->arg_end()) {
Assert(type->isUnmasked || type->isExported);
ctx->SetFunctionMask(LLVMMaskAllOn);
}
else {
Assert(type->isUnmasked == false);
// Otherwise use the mask to set the entry mask value
argIter->setName("__mask");
Assert(argIter->getType() == LLVMTypes::MaskType);
@@ -279,21 +309,30 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
// on, all off, or mixed. If this is a simple function, then this
// isn't worth the code bloat / overhead.
bool checkMask = (type->isTask == true) ||
((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
(
#if defined(LLVM_3_1)
(function->hasFnAttr(llvm::Attribute::AlwaysInline) == false)
#elif defined(LLVM_3_2)
(function->getFnAttributes().hasAttribute(llvm::Attributes::AlwaysInline) == false)
#else // LLVM 3.3+
(function->getAttributes().getFnAttributes().hasAttribute(llvm::Attribute::AlwaysInline) == false)
#endif
&&
costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
checkMask &= (type->isUnmasked == false);
checkMask &= (g->target.maskingIsFree == false);
checkMask &= (g->opt.disableCoherentControlFlow == false);
if (checkMask) {
llvm::Value *mask = ctx->GetFunctionMask();
llvm::Value *allOn = ctx->All(mask);
llvm::BasicBlock *bbAllOn = ctx->CreateBasicBlock("all_on");
llvm::BasicBlock *bbNotAll = ctx->CreateBasicBlock("not_all_on");
llvm::BasicBlock *bbSomeOn = ctx->CreateBasicBlock("some_on");
// Set up basic blocks for goto targets
ctx->InitializeLabelMap(code);
ctx->BranchInst(bbAllOn, bbNotAll, allOn);
ctx->BranchInst(bbAllOn, bbSomeOn, allOn);
// all on: we've determined dynamically that the mask is all
// on. Set the current mask to "all on" explicitly so that
// codegen for this path can be improved with this knowledge in
@@ -305,23 +344,11 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
if (ctx->GetCurrentBasicBlock())
ctx->ReturnInst();
// not all on: figure out if no instances are running, or if
// some of them are
ctx->SetCurrentBasicBlock(bbNotAll);
ctx->SetFunctionMask(mask);
llvm::BasicBlock *bbNoneOn = ctx->CreateBasicBlock("none_on");
llvm::BasicBlock *bbSomeOn = ctx->CreateBasicBlock("some_on");
llvm::Value *anyOn = ctx->Any(mask);
ctx->BranchInst(bbSomeOn, bbNoneOn, anyOn);
// Everyone is off; get out of here.
ctx->SetCurrentBasicBlock(bbNoneOn);
ctx->ReturnInst();
// some on: reset the mask to the value it had at function
// entry and emit the code. Resetting the mask here is
// important, due to the "all on" setting of it for the path
// above
// not all on: however, at least one lane must be running,
// since we should never run with all off... some on: reset
// the mask to the value it had at function entry and emit the
// code. Resetting the mask here is important, due to the "all
// on" setting of it for the path above.
ctx->SetCurrentBasicBlock(bbSomeOn);
ctx->SetFunctionMask(mask);
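In plain C-like terms, a hedged sketch of the entry-point control flow emitted above, after this change collapses the old all_on/not_all/none_on/some_on chain down to two blocks (the helper names stand in for generated basic blocks, not real functions):
// Illustrative only; mirrors the bbAllOn / bbSomeOn blocks created above.
if (lanes_all_on(entry_mask)) {
    // all_on: codegen on this path may assume an all-on mask, enabling the
    // cheaper unmasked load/store sequences.
    run_body(ALL_ON_MASK);
}
else {
    // some_on: the function is assumed never to be entered with an all-off
    // mask, so at least one lane is live; restore the incoming mask and run.
    run_body(entry_mask);
}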
@@ -355,7 +382,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
// issue a warning. Also need to warn if it's the entry block for
// the function (in which case it will not have predecessors but is
// still reachable.)
if (type->GetReturnType() != AtomicType::Void &&
if (Type::Equal(type->GetReturnType(), AtomicType::Void) == false &&
(pred_begin(ec.bblock) != pred_end(ec.bblock) || (ec.bblock == entryBBlock)))
Warning(sym->pos, "Missing return statement in function returning \"%s\".",
type->rType->GetString().c_str());
@@ -415,19 +442,22 @@ Function::GenerateIR() {
// If the function is 'export'-qualified, emit a second version of
// it without a mask parameter and without name mangling so that
// the application can call it
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
if (type->isExported) {
if (!type->isTask) {
LLVM_TYPE_CONST llvm::FunctionType *ftype =
type->LLVMFunctionType(g->ctx);
llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true);
llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
std::string functionName = sym->name;
if (g->mangleFunctionsWithTarget)
functionName += std::string("_") + g->target.GetISAString();
llvm::Function *appFunction =
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
#if defined(LLVM_3_1)
appFunction->setDoesNotThrow(true);
#else
appFunction->setDoesNotThrow();
#endif
if (appFunction->getName() != functionName) {
// this was a redefinition for which we already emitted an

func.h (4 changed lines)

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -43,7 +43,7 @@
class Function {
public:
Function(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code);
Function(Symbol *sym, Stmt *code);
const Type *GetReturnType() const;
const FunctionType *GetType() const;

ispc.cpp (412 changed lines)

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -41,27 +41,41 @@
#include "llvmutil.h"
#include <stdio.h>
#ifdef ISPC_IS_WINDOWS
#include <windows.h>
#include <direct.h>
#define strcasecmp stricmp
#include <windows.h>
#include <direct.h>
#define strcasecmp stricmp
#else
#include <sys/types.h>
#include <unistd.h>
#endif
#if defined(LLVM_3_1) || defined(LLVM_3_2)
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Instructions.h>
#else
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Instructions.h>
#endif
#if defined(LLVM_3_1)
#include <llvm/Analysis/DebugInfo.h>
#include <llvm/Analysis/DIBuilder.h>
#else
#include <llvm/DebugInfo.h>
#include <llvm/DIBuilder.h>
#endif
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Analysis/DIBuilder.h>
#include <llvm/Analysis/DebugInfo.h>
#include <llvm/Support/Dwarf.h>
#include <llvm/Instructions.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/Target/TargetOptions.h>
#include <llvm/Target/TargetData.h>
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
#include <llvm/Support/TargetRegistry.h>
#include <llvm/Support/TargetSelect.h>
#else
#include <llvm/Target/TargetRegistry.h>
#include <llvm/Target/TargetSelect.h>
#include <llvm/Target/SubtargetFeature.h>
#if defined(LLVM_3_1)
#include <llvm/Target/TargetData.h>
#elif defined(LLVM_3_2)
#include <llvm/DataLayout.h>
#else // LLVM 3.3+
#include <llvm/IR/DataLayout.h>
#endif
#include <llvm/Support/TargetRegistry.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/Host.h>
Globals *g;
@@ -70,31 +84,124 @@ Module *m;
///////////////////////////////////////////////////////////////////////////
// Target
#ifndef ISPC_IS_WINDOWS
static void __cpuid(int info[4], int infoType) {
__asm__ __volatile__ ("cpuid"
: "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
: "0" (infoType));
}
/* Save %ebx in case it's the PIC register */
static void __cpuidex(int info[4], int level, int count) {
__asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
"cpuid\n\t"
"xchg{l}\t{%%}ebx, %1\n\t"
: "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
: "0" (level), "2" (count));
}
#endif // ISPC_IS_WINDOWS
static const char *
lGetSystemISA() {
int info[4];
__cpuid(info, 1);
if ((info[2] & (1 << 28)) != 0) { // AVX
// AVX1 for sure....
// Ivy Bridge?
if ((info[2] & (1 << 29)) != 0 && // F16C
(info[2] & (1 << 30)) != 0) { // RDRAND
// So far, so good. AVX2?
// Call cpuid with eax=7, ecx=0
int info2[4];
__cpuidex(info2, 7, 0);
if ((info2[1] & (1 << 5)) != 0)
return "avx2";
else
return "avx1.1";
}
// Regular AVX
return "avx";
}
else if ((info[2] & (1 << 19)) != 0)
return "sse4";
else if ((info[3] & (1 << 26)) != 0)
return "sse2";
else {
fprintf(stderr, "Unable to detect supported SSE/AVX ISA. Exiting.\n");
exit(1);
}
}
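For reference, the CPUID feature bits tested above; these are the standard Intel feature-flag positions, summarized here for readability rather than taken from this change:
// CPUID.(EAX=1): ECX bit 28 = AVX, ECX bit 29 = F16C, ECX bit 30 = RDRAND,
//                ECX bit 19 = SSE4.1, EDX bit 26 = SSE2.
// CPUID.(EAX=7, ECX=0): EBX bit 5 = AVX2.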
static const char *supportedCPUs[] = {
"atom", "penryn", "core2", "corei7", "corei7-avx"
#if defined(LLVM_3_2) || defined(LLVM_3_3)
, "core-avx-i", "core-avx2"
#endif // LLVM_3_2 or LLVM_3_3
};
bool
Target::GetTarget(const char *arch, const char *cpu, const char *isa,
bool pic, Target *t) {
if (isa == NULL) {
if (cpu != NULL) {
// If a CPU was specified explicitly, try to pick the best
// possible ISA based on that.
if (!strcmp(cpu, "core-avx2"))
isa = "avx2";
else if (!strcmp(cpu, "core-avx-i"))
isa = "avx1.1";
else if (!strcmp(cpu, "sandybridge") ||
!strcmp(cpu, "corei7-avx"))
isa = "avx";
else if (!strcmp(cpu, "corei7") ||
!strcmp(cpu, "penryn"))
isa = "sse4";
else
isa = "sse2";
Warning(SourcePos(), "No --target specified on command-line. "
"Using ISA \"%s\" based on specified CPU \"%s\".", isa,
cpu);
}
else {
// No CPU and no ISA, so use CPUID to figure out what this CPU
// supports.
isa = lGetSystemISA();
Warning(SourcePos(), "No --target specified on command-line. "
"Using system ISA \"%s\".", isa);
}
}
if (cpu == NULL) {
std::string hostCPU = llvm::sys::getHostCPUName();
if (hostCPU.size() > 0)
cpu = strdup(hostCPU.c_str());
else {
fprintf(stderr, "Warning: unable to determine host CPU!\n");
Warning(SourcePos(), "Unable to determine host CPU!\n");
cpu = "generic";
}
}
else {
bool foundCPU = false;
for (int i = 0; i < int(sizeof(supportedCPUs) / sizeof(supportedCPUs[0]));
++i) {
if (!strcmp(cpu, supportedCPUs[i])) {
foundCPU = true;
break;
}
}
if (foundCPU == false) {
fprintf(stderr, "Error: CPU type \"%s\" unknown. Supported CPUs: "
"%s.\n", cpu, SupportedTargetCPUs().c_str());
return false;
}
}
t->cpu = cpu;
if (isa == NULL) {
if (!strcasecmp(cpu, "atom"))
isa = "sse2";
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
else if (!strcasecmp(cpu, "sandybridge") ||
!strcasecmp(cpu, "corei7-avx"))
isa = "avx";
#endif // LLVM_3_0
else
isa = "sse4";
}
if (arch == NULL)
arch = "x86-64";
@@ -125,13 +232,16 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->arch = arch;
}
// This is the case for most of them
t->hasHalf = t->hasRand = t->hasTranscendentals = false;
t->hasGather = t->hasScatter = false;
if (!strcasecmp(isa, "sse2")) {
t->isa = Target::SSE2;
t->nativeVectorWidth = 4;
t->vectorWidth = 4;
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "sse2-x2")) {
@@ -140,7 +250,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->vectorWidth = 8;
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "sse4")) {
@@ -149,7 +258,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->vectorWidth = 4;
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) {
@@ -158,7 +266,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->vectorWidth = 8;
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "generic-4")) {
@@ -166,73 +273,136 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->nativeVectorWidth = 4;
t->vectorWidth = 4;
t->maskingIsFree = true;
t->allOffMaskIsSafe = true;
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-8")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 8;
t->vectorWidth = 8;
t->maskingIsFree = true;
t->allOffMaskIsSafe = true;
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-16")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 16;
t->vectorWidth = 16;
t->maskingIsFree = true;
t->allOffMaskIsSafe = true;
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-32")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 32;
t->vectorWidth = 32;
t->maskingIsFree = true;
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-64")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 64;
t->vectorWidth = 64;
t->maskingIsFree = true;
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-1")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 1;
t->vectorWidth = 1;
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
else if (!strcasecmp(isa, "avx")) {
else if (!strcasecmp(isa, "avx") || !strcasecmp(isa, "avx1")) {
t->isa = Target::AVX;
t->nativeVectorWidth = 8;
t->vectorWidth = 8;
t->attributes = "+avx,+popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "avx-x2")) {
else if (!strcasecmp(isa, "avx-x2") || !strcasecmp(isa, "avx1-x2")) {
t->isa = Target::AVX;
t->nativeVectorWidth = 8;
t->vectorWidth = 16;
t->attributes = "+avx,+popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
#endif // LLVM 3.0+
#if defined(LLVM_3_1svn)
else if (!strcasecmp(isa, "avx1.1")) {
t->isa = Target::AVX11;
t->nativeVectorWidth = 8;
t->vectorWidth = 8;
t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand";
t->maskingIsFree = false;
t->maskBitCount = 32;
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
#endif
}
else if (!strcasecmp(isa, "avx1.1-x2")) {
t->isa = Target::AVX11;
t->nativeVectorWidth = 8;
t->vectorWidth = 16;
t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand";
t->maskingIsFree = false;
t->maskBitCount = 32;
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
#endif
}
else if (!strcasecmp(isa, "avx2")) {
t->isa = Target::AVX2;
t->nativeVectorWidth = 8;
t->vectorWidth = 8;
t->attributes = "+avx2,+popcnt,+cmov,+f16c";
t->attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand"
#ifndef LLVM_3_1
",+fma"
#endif // !LLVM_3_1
;
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
t->hasGather = true;
#endif
}
else if (!strcasecmp(isa, "avx2-x2")) {
t->isa = Target::AVX2;
t->nativeVectorWidth = 16;
t->vectorWidth = 16;
t->attributes = "+avx2,+popcnt,+cmov,+f16c";
t->attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand"
#ifndef LLVM_3_1
",+fma"
#endif // !LLVM_3_1
;
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
t->hasGather = true;
#endif
}
#endif // LLVM 3.1
else {
fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n",
isa, SupportedTargetISAs());
@@ -241,25 +411,31 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
if (!error) {
llvm::TargetMachine *targetMachine = t->GetTargetMachine();
#if defined(LLVM_3_1)
const llvm::TargetData *targetData = targetMachine->getTargetData();
t->is32Bit = (targetData->getPointerSize() == 4);
#else
int addressSpace = 0;
const llvm::DataLayout *dataLayout = targetMachine->getDataLayout();
t->is32Bit = (dataLayout->getPointerSize(addressSpace) == 4);
#endif
Assert(t->vectorWidth <= ISPC_MAX_NVEC);
}
return !error;
}
const char *
std::string
Target::SupportedTargetCPUs() {
return "atom, barcelona, core2, corei7, "
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
"corei7-avx, "
#endif
"istanbul, nocona, penryn, "
#ifdef LLVM_2_9
"sandybridge, "
#endif
"westmere";
std::string ret;
int count = sizeof(supportedCPUs) / sizeof(supportedCPUs[0]);
for (int i = 0; i < count; ++i) {
ret += supportedCPUs[i];
if (i != count - 1)
ret += ", ";
}
return ret;
}
@@ -271,14 +447,9 @@ Target::SupportedTargetArchs() {
const char *
Target::SupportedTargetISAs() {
return "sse2, sse2-x2, sse4, sse4-x2"
#ifndef LLVM_2_9
", avx, avx-x2"
#endif // !LLVM_2_9
#ifdef LLVM_3_1svn
", avx2, avx2-x2"
#endif // LLVM_3_1svn
", generic-4, generic-8, generic-16, generic-1";
return "sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2"
", avx1.1, avx1.1-x2, avx2, avx2-x2"
", generic-1, generic-4, generic-8, generic-16, generic-32";
}
@@ -286,11 +457,7 @@ std::string
Target::GetTripleString() const {
llvm::Triple triple;
// Start with the host triple as the default
#if defined(LLVM_3_1) || defined(LLVM_3_1svn)
triple.setTriple(llvm::sys::getDefaultTargetTriple());
#else
triple.setTriple(llvm::sys::getHostTriple());
#endif
// And override the arch in the host triple based on what the user
// specified. Here we need to deal with the fact that LLVM uses one
@@ -315,30 +482,15 @@ Target::GetTargetMachine() const {
llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ :
llvm::Reloc::Default;
#if defined(LLVM_3_1svn)
std::string featuresString = attributes;
llvm::TargetOptions options;
if (g->opt.fastMath == true)
options.UnsafeFPMath = 1;
#if !defined(LLVM_3_1)
if (g->opt.disableFMA == false)
options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
#endif // !LLVM_3_1
llvm::TargetMachine *targetMachine =
target->createTargetMachine(triple, cpu, featuresString, options,
relocModel);
#elif defined(LLVM_3_0)
std::string featuresString = attributes;
llvm::TargetMachine *targetMachine =
target->createTargetMachine(triple, cpu, featuresString, relocModel);
#else // LLVM 2.9
#ifdef ISPC_IS_APPLE
relocModel = llvm::Reloc::PIC_;
#endif // ISPC_IS_APPLE
std::string featuresString = cpu + std::string(",") + attributes;
llvm::TargetMachine *targetMachine =
target->createTargetMachine(triple, featuresString);
#ifndef ISPC_IS_WINDOWS
targetMachine->setRelocationModel(relocModel);
#endif // !ISPC_IS_WINDOWS
#endif // LLVM_2_9
Assert(targetMachine != NULL);
targetMachine->setAsmVerbosityDefault(true);
@@ -355,6 +507,8 @@ Target::GetISAString() const {
return "sse4";
case Target::AVX:
return "avx";
case Target::AVX11:
return "avx11";
case Target::AVX2:
return "avx2";
case Target::GENERIC:
@@ -367,7 +521,7 @@ Target::GetISAString() const {
static bool
lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
lGenericTypeLayoutIndeterminate(llvm::Type *type) {
if (type->isPrimitiveType() || type->isIntegerTy())
return false;
@@ -376,18 +530,18 @@ lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
type == LLVMTypes::Int1VectorType)
return true;
LLVM_TYPE_CONST llvm::ArrayType *at =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
llvm::ArrayType *at =
llvm::dyn_cast<llvm::ArrayType>(type);
if (at != NULL)
return lGenericTypeLayoutIndeterminate(at->getElementType());
LLVM_TYPE_CONST llvm::PointerType *pt =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(type);
llvm::PointerType *pt =
llvm::dyn_cast<llvm::PointerType>(type);
if (pt != NULL)
return false;
LLVM_TYPE_CONST llvm::StructType *st =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
llvm::StructType *st =
llvm::dyn_cast<llvm::StructType>(type);
if (st != NULL) {
for (int i = 0; i < (int)st->getNumElements(); ++i)
if (lGenericTypeLayoutIndeterminate(st->getElementType(i)))
@@ -395,29 +549,24 @@ lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
return false;
}
Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type));
Assert(llvm::isa<llvm::VectorType>(type));
return true;
}
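A hedged example of the case this predicate exists for, assuming the LLVMTypes helpers declared elsewhere in the tree (the struct assembled here is illustrative):
// Under the generic target the execution mask is an i1 vector whose storage
// size is left to the eventual backend, so an aggregate containing it has no
// fixed layout and SizeOf()/StructOffset() below must fall back to the
// GEP-on-null-pointer computation instead of consulting the DataLayout.
llvm::Type *members[] = { LLVMTypes::Int32Type, LLVMTypes::Int1VectorType };
llvm::StructType *st = llvm::StructType::get(*g->ctx, members);
// lGenericTypeLayoutIndeterminate(st) is expected to return true here.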
llvm::Value *
Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
Target::SizeOf(llvm::Type *type,
llvm::BasicBlock *insertAtEnd) {
if (isa == Target::GENERIC &&
lGenericTypeLayoutIndeterminate(type)) {
llvm::Value *index[1] = { LLVMInt32(1) };
LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
llvm::Instruction *gep =
llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "sizeof_gep",
insertAtEnd);
#else
llvm::Instruction *gep =
llvm::GetElementPtrInst::Create(voidPtr, &index[0], &index[1],
"sizeof_gep", insertAtEnd);
#endif
if (is32Bit || g->opt.force32BitAddressing)
return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type,
"sizeof_int", insertAtEnd);
@@ -426,9 +575,18 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
"sizeof_int", insertAtEnd);
}
#if defined(LLVM_3_1)
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
Assert(td != NULL);
uint64_t byteSize = td->getTypeSizeInBits(type) / 8;
uint64_t bitSize = td->getTypeSizeInBits(type);
#else
const llvm::DataLayout *dl = GetTargetMachine()->getDataLayout();
Assert(dl != NULL);
uint64_t bitSize = dl->getTypeSizeInBits(type);
#endif
Assert((bitSize % 8) == 0);
uint64_t byteSize = bitSize / 8;
if (is32Bit || g->opt.force32BitAddressing)
return LLVMInt32((int32_t)byteSize);
else
@@ -437,23 +595,18 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *
Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
Target::StructOffset(llvm::Type *type, int element,
llvm::BasicBlock *insertAtEnd) {
if (isa == Target::GENERIC &&
lGenericTypeLayoutIndeterminate(type) == true) {
llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(element) };
LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
llvm::Instruction *gep =
llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "offset_gep",
insertAtEnd);
#else
llvm::Instruction *gep =
llvm::GetElementPtrInst::Create(voidPtr, &indices[0], &indices[2],
"offset_gep", insertAtEnd);
#endif
if (is32Bit || g->opt.force32BitAddressing)
return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type,
"offset_int", insertAtEnd);
@@ -462,12 +615,22 @@ Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
"offset_int", insertAtEnd);
}
llvm::StructType *structType =
llvm::dyn_cast<llvm::StructType>(type);
if (structType == NULL || structType->isSized() == false) {
Assert(m->errorCount > 0);
return NULL;
}
#if defined(LLVM_3_1)
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
Assert(td != NULL);
LLVM_TYPE_CONST llvm::StructType *structType =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
Assert(structType != NULL);
const llvm::StructLayout *sl = td->getStructLayout(structType);
#else
const llvm::DataLayout *dl = GetTargetMachine()->getDataLayout();
Assert(dl != NULL);
const llvm::StructLayout *sl = dl->getStructLayout(structType);
#endif
Assert(sl != NULL);
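// StructLayout::getElementOffset() returns the element's byte offset within
// the struct, with the target ABI's alignment padding already applied.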
uint64_t offset = sl->getElementOffset(element);
@@ -488,6 +651,8 @@ Opt::Opt() {
force32BitAddressing = true;
unrollLoops = true;
disableAsserts = false;
disableFMA = false;
forceAlignedMemory = false;
disableMaskAllOnOptimizations = false;
disableHandlePseudoMemoryOps = false;
disableBlendedMaskedStores = false;
@@ -497,6 +662,7 @@ Opt::Opt() {
disableMaskedStoreToStore = false;
disableGatherScatterFlattening = false;
disableUniformMemoryOptimizations = false;
disableCoalescing = false;
}
///////////////////////////////////////////////////////////////////////////
@@ -510,12 +676,16 @@ Globals::Globals() {
debugPrint = false;
disableWarnings = false;
warningsAsErrors = false;
quiet = false;
forceColoredOutput = false;
disableLineWrap = false;
emitPerfWarnings = true;
emitInstrumentation = false;
generateDebuggingSymbols = false;
enableFuzzTest = false;
fuzzTestSeed = -1;
mangleFunctionsWithTarget = false;
ctx = new llvm::LLVMContext;
#ifdef ISPC_IS_WINDOWS
@@ -548,7 +718,9 @@ llvm::DIFile
SourcePos::GetDIFile() const {
std::string directory, filename;
GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
return m->diBuilder->createFile(filename, directory);
llvm::DIFile ret = m->diBuilder->createFile(filename, directory);
Assert(ret.Verify());
return ret;
}

ispc.h (120 changed lines)

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -38,8 +38,10 @@
#ifndef ISPC_H
#define ISPC_H
#if !defined(LLVM_2_9) && !defined(LLVM_3_0) && !defined(LLVM_3_0svn) && !defined(LLVM_3_1svn)
#error "Only LLVM 2.9, 3.0, and the 3.1 development branch are supported"
#define ISPC_VERSION "1.3.1dev"
#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3)
#error "Only LLVM 3.1, 3.2 and the 3.3 development branch are supported"
#endif
#if defined(_WIN32) || defined(_WIN64)
@@ -49,6 +51,9 @@
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#if defined(__KNC__)
#define ISPC_IS_KNC
#endif
#include <stdint.h>
#include <stdlib.h>
@@ -56,20 +61,10 @@
#include <vector>
#include <string>
#define Assert(expr) \
((void)((expr) ? 0 : __Assert (#expr, __FILE__, __LINE__)))
#define __Assert(expr, file, line) \
((void)fprintf(stderr, "%s:%u: Assertion failed: \"%s\"\n" \
"***\n*** Please file a bug report at " \
"https://github.com/ispc/ispc/issues\n*** (Including as much " \
"information as you can about how to reproduce this error).\n" \
"*** You have apparently encountered a bug in the compiler that " \
"we'd like to fix!\n***\n", file, line, expr), abort(), 0)
/** @def ISPC_MAX_NVEC maximum vector size of any of the compilation
targets.
*/
#define ISPC_MAX_NVEC 16
#define ISPC_MAX_NVEC 64
// Forward declarations of a number of widely-used LLVM types
namespace llvm {
@@ -90,12 +85,6 @@ namespace llvm {
class Value;
}
// llvm::Type *s are no longer const in llvm 3.0
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
#define LLVM_TYPE_CONST
#else
#define LLVM_TYPE_CONST const
#endif
class ArrayType;
class AST;
@@ -107,12 +96,22 @@ class ExprList;
class Function;
class FunctionType;
class Module;
class PointerType;
class Stmt;
class Symbol;
class SymbolTable;
class Type;
struct VariableDeclaration;
enum StorageClass {
SC_NONE,
SC_EXTERN,
SC_STATIC,
SC_TYPEDEF,
SC_EXTERN_C
};
/** @brief Representation of a range of positions in a source file.
This class represents a range of characters in a source file
@@ -139,11 +138,25 @@ struct SourcePos {
bool operator==(const SourcePos &p2) const;
};
/** Returns a SourcePos that encompasses the extent of both of the given
extents. */
SourcePos Union(const SourcePos &p1, const SourcePos &p2);
// Assert
extern void DoAssert(const char *file, int line, const char *expr);
extern void DoAssertPos(SourcePos pos, const char *file, int line, const char *expr);
#define Assert(expr) \
((void)((expr) ? 0 : ((void)DoAssert (__FILE__, __LINE__, #expr), 0)))
#define AssertPos(pos, expr) \
((void)((expr) ? 0 : ((void)DoAssertPos (pos, __FILE__, __LINE__, #expr), 0)))
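A hedged usage note: the two macros are meant as drop-in replacements for the old inline Assert, with AssertPos additionally carrying a source position so the failure report can point at the offending ispc input. Illustrative call sites (the surrounding objects are assumptions):
Assert(type != NULL);                            // internal invariant, no user position
AssertPos(expr->pos, expr->GetType() != NULL);   // ties the failure to expr's location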
/** @brief Structure that defines a compilation target
This structure defines a compilation target for the ispc compiler.
@@ -161,7 +174,7 @@ struct Target {
/** Returns a comma-delimited string giving the names of the currently
supported target CPUs. */
static const char *SupportedTargetCPUs();
static std::string SupportedTargetCPUs();
/** Returns a comma-delimited string giving the names of the currently
supported target architectures. */
@@ -179,13 +192,13 @@ struct Target {
const char *GetISAString() const;
/** Returns the size of the given type */
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *SizeOf(llvm::Type *type,
llvm::BasicBlock *insertAtEnd);
/** Given a structure type and an element number in the structure,
returns a value corresponding to the number of bytes from the start
of the structure where the element is located. */
llvm::Value *StructOffset(LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *StructOffset(llvm::Type *type,
int element, llvm::BasicBlock *insertAtEnd);
/** llvm Target object representing this target. */
@@ -197,7 +210,7 @@ struct Target {
flexible/performant of them will appear last in the enumerant. Note
also that __best_available_isa() needs to be updated if ISAs are
added or the enumerant values are reordered. */
enum ISA { SSE2, SSE4, AVX, AVX2, GENERIC, NUM_ISAS };
enum ISA { SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS };
/** Instruction set being compiled to. */
ISA isa;
@@ -233,16 +246,27 @@ struct Target {
natively. */
bool maskingIsFree;
/** Is it safe to run code with the mask all off? e.g. on SSE, the fast
gather trick assumes that at least one program instance is running
(so that it can safely assume that the array base pointer is
valid). */
bool allOffMaskIsSafe;
/** How many bits are used to store each element of the mask: e.g. this
is 32 on SSE/AVX, since that matches the HW better, but it's 1 for
the generic target. */
int maskBitCount;
/** Indicates whether the target has native support for float/half
conversions. */
bool hasHalf;
/** Indicates whether there is an ISA random number instruction. */
bool hasRand;
/** Indicates whether the target has a native gather instruction */
bool hasGather;
/** Indicates whether the target has a native scatter instruction */
bool hasScatter;
/** Indicates whether the target has support for transcendentals (beyond
sqrt, which we assume that all of them handle). */
bool hasTranscendentals;
};
@@ -283,6 +307,16 @@ struct Opt {
performance in the generated code). */
bool disableAsserts;
/** Indicates whether FMA instructions should be disabled (on targets
that support them). */
bool disableFMA;
/** Always generate aligned vector load/store instructions; this
implies a guarantee that all dynamic access through pointers that
becomes a vector load/store will be a cache-aligned sequence of
locations. */
bool forceAlignedMemory;
/** If enabled, disables the various optimizations that kick in when
the execution mask can be determined to be "all on" at compile
time. */
@@ -339,6 +373,10 @@ struct Opt {
than gathers/scatters. This is likely only useful for measuring
the impact of this optimization. */
bool disableUniformMemoryOptimizations;
/** Disables optimizations that coalesce incoherent scalar memory
access from gathers into wider vector operations, when possible. */
bool disableCoalescing;
};
/** @brief This structure collects together a number of global variables.
@@ -388,6 +426,13 @@ struct Globals {
possible performance pitfalls. */
bool emitPerfWarnings;
/** Indicates whether all printed output should be suppressed. */
bool quiet;
/** Always use ANSI escape sequences to colorize warning and error
messages, even if piping output to a file, etc. */
bool forceColoredOutput;
/** Indicates whether calls should be emitted in the program to an
externally-defined program instrumentation function. (See the
"Instrumenting your ispc programs" section in the user's
@@ -402,6 +447,14 @@ struct Globals {
vector width to them. */
bool mangleFunctionsWithTarget;
/** If enabled, the lexer will randomly replace some tokens returned
with other tokens, in order to test error condition handling in the
compiler. */
bool enableFuzzTest;
/** Seed for random number generator used for fuzz testing. */
int fuzzTestSeed;
/** Global LLVMContext object */
llvm::LLVMContext *ctx;
@@ -412,11 +465,14 @@ struct Globals {
/** Arguments to pass along to the C pre-processor, if it is run on the
program before compilation. */
std::vector<std::string> cppArgs;
/** Additional user-provided directories to search when processing
#include directives in the preprocessor. */
std::vector<std::string> includePath;
};
enum {
COST_ASSIGN = 1,
COST_COHERENT_BREAK_CONTINE = 4,
COST_COMPLEX_ARITH_OP = 4,
COST_DELETE = 32,
COST_DEREF = 4,
@@ -427,7 +483,7 @@ enum {
COST_GOTO = 4,
COST_LOAD = 2,
COST_NEW = 32,
COST_REGULAR_BREAK_CONTINUE = 2,
COST_BREAK_CONTINUE = 3,
COST_RETURN = 4,
COST_SELECT = 4,
COST_SIMPLE_ARITH_LOGIC_OP = 1,

(Visual Studio project file; name not shown)

@@ -20,6 +20,8 @@
<ClCompile Include="func.cpp" />
<ClCompile Include="gen-bitcode-avx1.cpp" />
<ClCompile Include="gen-bitcode-avx1-x2.cpp" />
<ClCompile Include="gen-bitcode-avx11.cpp" />
<ClCompile Include="gen-bitcode-avx11-x2.cpp" />
<ClCompile Include="gen-bitcode-avx2.cpp" />
<ClCompile Include="gen-bitcode-avx2-x2.cpp" />
<ClCompile Include="gen-bitcode-c-32.cpp" />
@@ -29,6 +31,8 @@
<ClCompile Include="gen-bitcode-generic-4.cpp" />
<ClCompile Include="gen-bitcode-generic-8.cpp" />
<ClCompile Include="gen-bitcode-generic-16.cpp" />
<ClCompile Include="gen-bitcode-generic-32.cpp" />
<ClCompile Include="gen-bitcode-generic-64.cpp" />
<ClCompile Include="gen-bitcode-sse2.cpp" />
<ClCompile Include="gen-bitcode-sse2-x2.cpp" />
<ClCompile Include="gen-bitcode-sse4.cpp" />
@@ -186,6 +190,32 @@
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-avx11.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll &gt; gen-bitcode-avx11.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx11.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll &gt; gen-bitcode-avx11.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx11.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx11.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx11.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-avx11-x2.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll &gt; gen-bitcode-avx11-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx11-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll &gt; gen-bitcode-avx11-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx11-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx11-x2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx11-x2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-avx2.ll">
<FileType>Document</FileType>
@@ -264,6 +294,32 @@
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-16.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-generic-32.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll &gt; gen-bitcode-generic-32.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-32.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll &gt; gen-bitcode-generic-32.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-32.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-32.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-32.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-generic-64.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll &gt; gen-bitcode-generic-64.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-64.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll &gt; gen-bitcode-generic-64.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-64.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-64.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-64.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="lex.ll">
<FileType>Document</FileType>
@@ -324,7 +380,7 @@
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
<PreprocessorDefinitions>NOMINMAX;%LLVM_VERSION%</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
</ClCompile>
@@ -332,7 +388,7 @@
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -342,7 +398,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
<PreprocessorDefinitions>NOMINMAX;%LLVM_VERSION%</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
</ClCompile>
@@ -352,7 +408,7 @@
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

lex.ll (652 changed lines)

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -43,31 +43,294 @@
#include <stdint.h>
static uint64_t lParseBinary(const char *ptr, SourcePos pos, char **endPtr);
static int lParseInteger(bool dotdotdot);
static void lCComment(SourcePos *);
static void lCppComment(SourcePos *);
static void lHandleCppHash(SourcePos *);
static void lStringConst(YYSTYPE *, SourcePos *);
static double lParseHexFloat(const char *ptr);
extern void RegisterDependency(const std::string &fileName);
#define YY_USER_ACTION \
yylloc->first_line = yylloc->last_line; \
yylloc->first_column = yylloc->last_column; \
yylloc->last_column += yyleng;
yylloc.first_line = yylloc.last_line; \
yylloc.first_column = yylloc.last_column; \
yylloc.last_column += yyleng;
#ifdef ISPC_IS_WINDOWS
inline int isatty(int) { return 0; }
#else
#include <unistd.h>
#endif // ISPC_IS_WINDOWS
static int allTokens[] = {
TOKEN_ASSERT, TOKEN_BOOL, TOKEN_BREAK, TOKEN_CASE,
TOKEN_CDO, TOKEN_CFOR, TOKEN_CIF, TOKEN_CWHILE,
TOKEN_CONST, TOKEN_CONTINUE, TOKEN_DEFAULT, TOKEN_DO,
TOKEN_DELETE, TOKEN_DOUBLE, TOKEN_ELSE, TOKEN_ENUM,
TOKEN_EXPORT, TOKEN_EXTERN, TOKEN_FALSE, TOKEN_FLOAT, TOKEN_FOR,
TOKEN_FOREACH, TOKEN_FOREACH_ACTIVE, TOKEN_FOREACH_TILED,
TOKEN_FOREACH_UNIQUE, TOKEN_GOTO, TOKEN_IF, TOKEN_IN, TOKEN_INLINE,
TOKEN_INT, TOKEN_INT8, TOKEN_INT16, TOKEN_INT, TOKEN_INT64, TOKEN_LAUNCH,
TOKEN_NEW, TOKEN_NULL, TOKEN_PRINT, TOKEN_RETURN, TOKEN_SOA, TOKEN_SIGNED,
TOKEN_SIZEOF, TOKEN_STATIC, TOKEN_STRUCT, TOKEN_SWITCH, TOKEN_SYNC,
TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED,
TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE,
TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT,
TOKEN_FLOAT_CONSTANT,
TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT,
TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT,
TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP,
TOKEN_GE_OP, TOKEN_EQ_OP, TOKEN_NE_OP, TOKEN_AND_OP, TOKEN_OR_OP,
TOKEN_MUL_ASSIGN, TOKEN_DIV_ASSIGN, TOKEN_MOD_ASSIGN, TOKEN_ADD_ASSIGN,
TOKEN_SUB_ASSIGN, TOKEN_LEFT_ASSIGN, TOKEN_RIGHT_ASSIGN, TOKEN_AND_ASSIGN,
TOKEN_XOR_ASSIGN, TOKEN_OR_ASSIGN, TOKEN_PTR_OP,
';', '{', '}', ',', ':', '=', '(', ')', '[', ']', '.', '&', '!', '~', '-',
'+', '*', '/', '%', '<', '>', '^', '|', '?',
};
std::map<int, std::string> tokenToName;
std::map<std::string, std::string> tokenNameRemap;
void ParserInit() {
tokenToName[TOKEN_ASSERT] = "assert";
tokenToName[TOKEN_BOOL] = "bool";
tokenToName[TOKEN_BREAK] = "break";
tokenToName[TOKEN_CASE] = "case";
tokenToName[TOKEN_CDO] = "cdo";
tokenToName[TOKEN_CFOR] = "cfor";
tokenToName[TOKEN_CIF] = "cif";
tokenToName[TOKEN_CWHILE] = "cwhile";
tokenToName[TOKEN_CONST] = "const";
tokenToName[TOKEN_CONTINUE] = "continue";
tokenToName[TOKEN_DEFAULT] = "default";
tokenToName[TOKEN_DO] = "do";
tokenToName[TOKEN_DELETE] = "delete";
tokenToName[TOKEN_DOUBLE] = "double";
tokenToName[TOKEN_ELSE] = "else";
tokenToName[TOKEN_ENUM] = "enum";
tokenToName[TOKEN_EXPORT] = "export";
tokenToName[TOKEN_EXTERN] = "extern";
tokenToName[TOKEN_FALSE] = "false";
tokenToName[TOKEN_FLOAT] = "float";
tokenToName[TOKEN_FOR] = "for";
tokenToName[TOKEN_FOREACH] = "foreach";
tokenToName[TOKEN_FOREACH_ACTIVE] = "foreach_active";
tokenToName[TOKEN_FOREACH_TILED] = "foreach_tiled";
tokenToName[TOKEN_FOREACH_UNIQUE] = "foreach_unique";
tokenToName[TOKEN_GOTO] = "goto";
tokenToName[TOKEN_IF] = "if";
tokenToName[TOKEN_IN] = "in";
tokenToName[TOKEN_INLINE] = "inline";
tokenToName[TOKEN_INT] = "int";
tokenToName[TOKEN_INT8] = "int8";
tokenToName[TOKEN_INT16] = "int16";
tokenToName[TOKEN_INT] = "int";
tokenToName[TOKEN_INT64] = "int64";
tokenToName[TOKEN_LAUNCH] = "launch";
tokenToName[TOKEN_NEW] = "new";
tokenToName[TOKEN_NULL] = "NULL";
tokenToName[TOKEN_PRINT] = "print";
tokenToName[TOKEN_RETURN] = "return";
tokenToName[TOKEN_SOA] = "soa";
tokenToName[TOKEN_SIGNED] = "signed";
tokenToName[TOKEN_SIZEOF] = "sizeof";
tokenToName[TOKEN_STATIC] = "static";
tokenToName[TOKEN_STRUCT] = "struct";
tokenToName[TOKEN_SWITCH] = "switch";
tokenToName[TOKEN_SYNC] = "sync";
tokenToName[TOKEN_TASK] = "task";
tokenToName[TOKEN_TRUE] = "true";
tokenToName[TOKEN_TYPEDEF] = "typedef";
tokenToName[TOKEN_UNIFORM] = "uniform";
tokenToName[TOKEN_UNMASKED] = "unmasked";
tokenToName[TOKEN_UNSIGNED] = "unsigned";
tokenToName[TOKEN_VARYING] = "varying";
tokenToName[TOKEN_VOID] = "void";
tokenToName[TOKEN_WHILE] = "while";
tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\"";
tokenToName[TOKEN_DOTDOTDOT] = "...";
tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT";
tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT";
tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT";
tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT";
tokenToName[TOKEN_UINT64_CONSTANT] = "TOKEN_UINT64_CONSTANT";
tokenToName[TOKEN_INC_OP] = "++";
tokenToName[TOKEN_DEC_OP] = "--";
tokenToName[TOKEN_LEFT_OP] = "<<";
tokenToName[TOKEN_RIGHT_OP] = ">>";
tokenToName[TOKEN_LE_OP] = "<=";
tokenToName[TOKEN_GE_OP] = ">=";
tokenToName[TOKEN_EQ_OP] = "==";
tokenToName[TOKEN_NE_OP] = "!=";
tokenToName[TOKEN_AND_OP] = "&&";
tokenToName[TOKEN_OR_OP] = "||";
tokenToName[TOKEN_MUL_ASSIGN] = "*=";
tokenToName[TOKEN_DIV_ASSIGN] = "/=";
tokenToName[TOKEN_MOD_ASSIGN] = "%=";
tokenToName[TOKEN_ADD_ASSIGN] = "+=";
tokenToName[TOKEN_SUB_ASSIGN] = "-=";
tokenToName[TOKEN_LEFT_ASSIGN] = "<<=";
tokenToName[TOKEN_RIGHT_ASSIGN] = ">>=";
tokenToName[TOKEN_AND_ASSIGN] = "&=";
tokenToName[TOKEN_XOR_ASSIGN] = "^=";
tokenToName[TOKEN_OR_ASSIGN] = "|=";
tokenToName[TOKEN_PTR_OP] = "->";
tokenToName[';'] = ";";
tokenToName['{'] = "{";
tokenToName['}'] = "}";
tokenToName[','] = ",";
tokenToName[':'] = ":";
tokenToName['='] = "=";
tokenToName['('] = "(";
tokenToName[')'] = ")";
tokenToName['['] = "[";
tokenToName[']'] = "]";
tokenToName['.'] = ".";
tokenToName['&'] = "&";
tokenToName['!'] = "!";
tokenToName['~'] = "~";
tokenToName['-'] = "-";
tokenToName['+'] = "+";
tokenToName['*'] = "*";
tokenToName['/'] = "/";
tokenToName['%'] = "%";
tokenToName['<'] = "<";
tokenToName['>'] = ">";
tokenToName['^'] = "^";
tokenToName['|'] = "|";
tokenToName['?'] = "?";
tokenToName[';'] = ";";
tokenNameRemap["TOKEN_ASSERT"] = "\'assert\'";
tokenNameRemap["TOKEN_BOOL"] = "\'bool\'";
tokenNameRemap["TOKEN_BREAK"] = "\'break\'";
tokenNameRemap["TOKEN_CASE"] = "\'case\'";
tokenNameRemap["TOKEN_CDO"] = "\'cdo\'";
tokenNameRemap["TOKEN_CFOR"] = "\'cfor\'";
tokenNameRemap["TOKEN_CIF"] = "\'cif\'";
tokenNameRemap["TOKEN_CWHILE"] = "\'cwhile\'";
tokenNameRemap["TOKEN_CONST"] = "\'const\'";
tokenNameRemap["TOKEN_CONTINUE"] = "\'continue\'";
tokenNameRemap["TOKEN_DEFAULT"] = "\'default\'";
tokenNameRemap["TOKEN_DO"] = "\'do\'";
tokenNameRemap["TOKEN_DELETE"] = "\'delete\'";
tokenNameRemap["TOKEN_DOUBLE"] = "\'double\'";
tokenNameRemap["TOKEN_ELSE"] = "\'else\'";
tokenNameRemap["TOKEN_ENUM"] = "\'enum\'";
tokenNameRemap["TOKEN_EXPORT"] = "\'export\'";
tokenNameRemap["TOKEN_EXTERN"] = "\'extern\'";
tokenNameRemap["TOKEN_FALSE"] = "\'false\'";
tokenNameRemap["TOKEN_FLOAT"] = "\'float\'";
tokenNameRemap["TOKEN_FOR"] = "\'for\'";
tokenNameRemap["TOKEN_FOREACH"] = "\'foreach\'";
tokenNameRemap["TOKEN_FOREACH_ACTIVE"] = "\'foreach_active\'";
tokenNameRemap["TOKEN_FOREACH_TILED"] = "\'foreach_tiled\'";
tokenNameRemap["TOKEN_FOREACH_UNIQUE"] = "\'foreach_unique\'";
tokenNameRemap["TOKEN_GOTO"] = "\'goto\'";
tokenNameRemap["TOKEN_IDENTIFIER"] = "identifier";
tokenNameRemap["TOKEN_IF"] = "\'if\'";
tokenNameRemap["TOKEN_IN"] = "\'in\'";
tokenNameRemap["TOKEN_INLINE"] = "\'inline\'";
tokenNameRemap["TOKEN_INT"] = "\'int\'";
tokenNameRemap["TOKEN_INT8"] = "\'int8\'";
tokenNameRemap["TOKEN_INT16"] = "\'int16\'";
tokenNameRemap["TOKEN_INT"] = "\'int\'";
tokenNameRemap["TOKEN_INT64"] = "\'int64\'";
tokenNameRemap["TOKEN_LAUNCH"] = "\'launch\'";
tokenNameRemap["TOKEN_NEW"] = "\'new\'";
tokenNameRemap["TOKEN_NULL"] = "\'NULL\'";
tokenNameRemap["TOKEN_PRINT"] = "\'print\'";
tokenNameRemap["TOKEN_RETURN"] = "\'return\'";
tokenNameRemap["TOKEN_SOA"] = "\'soa\'";
tokenNameRemap["TOKEN_SIGNED"] = "\'signed\'";
tokenNameRemap["TOKEN_SIZEOF"] = "\'sizeof\'";
tokenNameRemap["TOKEN_STATIC"] = "\'static\'";
tokenNameRemap["TOKEN_STRUCT"] = "\'struct\'";
tokenNameRemap["TOKEN_SWITCH"] = "\'switch\'";
tokenNameRemap["TOKEN_SYNC"] = "\'sync\'";
tokenNameRemap["TOKEN_TASK"] = "\'task\'";
tokenNameRemap["TOKEN_TRUE"] = "\'true\'";
tokenNameRemap["TOKEN_TYPEDEF"] = "\'typedef\'";
tokenNameRemap["TOKEN_UNIFORM"] = "\'uniform\'";
tokenNameRemap["TOKEN_UNMASKED"] = "\'unmasked\'";
tokenNameRemap["TOKEN_UNSIGNED"] = "\'unsigned\'";
tokenNameRemap["TOKEN_VARYING"] = "\'varying\'";
tokenNameRemap["TOKEN_VOID"] = "\'void\'";
tokenNameRemap["TOKEN_WHILE"] = "\'while\'";
tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\"";
tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'";
tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant";
tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant";
tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant";
tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant";
tokenNameRemap["TOKEN_UINT64_CONSTANT"] = "unsigned int64 constant";
tokenNameRemap["TOKEN_INC_OP"] = "\'++\'";
tokenNameRemap["TOKEN_DEC_OP"] = "\'--\'";
tokenNameRemap["TOKEN_LEFT_OP"] = "\'<<\'";
tokenNameRemap["TOKEN_RIGHT_OP"] = "\'>>\'";
tokenNameRemap["TOKEN_LE_OP"] = "\'<=\'";
tokenNameRemap["TOKEN_GE_OP"] = "\'>=\'";
tokenNameRemap["TOKEN_EQ_OP"] = "\'==\'";
tokenNameRemap["TOKEN_NE_OP"] = "\'!=\'";
tokenNameRemap["TOKEN_AND_OP"] = "\'&&\'";
tokenNameRemap["TOKEN_OR_OP"] = "\'||\'";
tokenNameRemap["TOKEN_MUL_ASSIGN"] = "\'*=\'";
tokenNameRemap["TOKEN_DIV_ASSIGN"] = "\'/=\'";
tokenNameRemap["TOKEN_MOD_ASSIGN"] = "\'%=\'";
tokenNameRemap["TOKEN_ADD_ASSIGN"] = "\'+=\'";
tokenNameRemap["TOKEN_SUB_ASSIGN"] = "\'-=\'";
tokenNameRemap["TOKEN_LEFT_ASSIGN"] = "\'<<=\'";
tokenNameRemap["TOKEN_RIGHT_ASSIGN"] = "\'>>=\'";
tokenNameRemap["TOKEN_AND_ASSIGN"] = "\'&=\'";
tokenNameRemap["TOKEN_XOR_ASSIGN"] = "\'^=\'";
tokenNameRemap["TOKEN_OR_ASSIGN"] = "\'|=\'";
tokenNameRemap["TOKEN_PTR_OP"] = "\'->\'";
tokenNameRemap["$end"] = "end of file";
}
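/* Editor's sketch, not part of lex.ll: tokenNameRemap presumably exists so that
 * bison's internal token names (e.g. "TOKEN_INT") can be rewritten into the
 * user-facing spellings above when reporting syntax errors.  The helper below
 * shows one plausible way such a table could be consulted; the function name
 * lTokenDisplayName is invented for illustration. */
#include <map>
#include <string>

static std::string
lTokenDisplayName(const std::string &internalName,
                  const std::map<std::string, std::string> &remap) {
    // Return the friendly spelling if the table knows this token, else the raw name.
    std::map<std::string, std::string>::const_iterator it = remap.find(internalName);
    return (it != remap.end()) ? it->second : internalName;
}
/* e.g. lTokenDisplayName("TOKEN_INT", tokenNameRemap) would yield "'int'". */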
inline int ispcRand() {
#ifdef ISPC_IS_WINDOWS
return rand();
#else
return lrand48();
#endif
}
#define RT \
if (g->enableFuzzTest) { \
int r = ispcRand() % 40; \
if (r == 0) { \
Warning(yylloc, "Fuzz test dropping token"); \
} \
else if (r == 1) { \
Assert (tokenToName.size() > 0); \
int nt = sizeof(allTokens) / sizeof(allTokens[0]); \
int tn = ispcRand() % nt; \
yylval.stringVal = new std::string(yytext); /* just in case */\
Warning(yylloc, "Fuzz test replaced token with \"%s\"", tokenToName[allTokens[tn]].c_str()); \
return allTokens[tn]; \
} \
else if (r == 2) { \
Symbol *sym = m->symbolTable->RandomSymbol(); \
if (sym != NULL) { \
yylval.stringVal = new std::string(sym->name); \
Warning(yylloc, "Fuzz test replaced with identifier \"%s\".", sym->name.c_str()); \
return TOKEN_IDENTIFIER; \
} \
} \
/* TOKEN_TYPE_NAME */ \
} else /* swallow semicolon */
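/* Editor's sketch, not part of lex.ll: the RT macro above implements token-level
 * fuzz testing -- roughly 1 token in 40 is reported as dropped, replaced with a
 * random token from allTokens, or replaced with a random identifier pulled from
 * the symbol table.  The enum and function below restate that decision logic in
 * isolation; FuzzAction and lFuzzDecision are invented names for illustration. */
enum FuzzAction { FUZZ_KEEP, FUZZ_DROP, FUZZ_RANDOM_TOKEN, FUZZ_RANDOM_IDENT };

static FuzzAction
lFuzzDecision(bool fuzzEnabled) {
    if (!fuzzEnabled)
        return FUZZ_KEEP;
    switch (ispcRand() % 40) {
    case 0:  return FUZZ_DROP;          // warn "Fuzz test dropping token"
    case 1:  return FUZZ_RANDOM_TOKEN;  // substitute a random entry from allTokens
    case 2:  return FUZZ_RANDOM_IDENT;  // substitute a random symbol-table identifier
    default: return FUZZ_KEEP;          // the other 37 times out of 40, pass through
    }
}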
%}
%option nounput
%option noyywrap
%option bison-bridge
%option bison-locations
%option nounistd
WHITESPACE [ \t\r]+
INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[kMG]?
INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*
INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\.
FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)
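/* Editor's note, illustrative only (not part of lex.ll): sample literals the
 * numeric patterns above accept.  INT_NUMBER covers decimal, hex (0x...) and
 * binary (0b...) forms with optional u/U/l/L suffixes and an optional k/M/G
 * scaling suffix; INT_NUMBER_DOTDOTDOT is the same form followed by "...";
 * FLOAT_NUMBER and HEX_FLOAT_NUMBER cover the usual decimal and hex float
 * spellings.
 *
 *   INT_NUMBER:            42   0xff   0b1011   16k   64M   10uL
 *   INT_NUMBER_DOTDOTDOT:  0...      16k...
 *   FLOAT_NUMBER:          1.0   .5f   2e10   3.25e-2f
 *   HEX_FLOAT_NUMBER:      0x1.8p3   0x1p-4f
 */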
@@ -75,200 +338,167 @@ IDENT [a-zA-Z_][a-zA-Z_0-9]*
ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+
%%
"/*" { lCComment(yylloc); }
"//" { lCppComment(yylloc); }
"/*" { lCComment(&yylloc); }
"//" { lCppComment(&yylloc); }
__assert { return TOKEN_ASSERT; }
bool { return TOKEN_BOOL; }
break { return TOKEN_BREAK; }
case { return TOKEN_CASE; }
cbreak { return TOKEN_CBREAK; }
ccontinue { return TOKEN_CCONTINUE; }
cdo { return TOKEN_CDO; }
cfor { return TOKEN_CFOR; }
cif { return TOKEN_CIF; }
cwhile { return TOKEN_CWHILE; }
const { return TOKEN_CONST; }
continue { return TOKEN_CONTINUE; }
creturn { return TOKEN_CRETURN; }
default { return TOKEN_DEFAULT; }
do { return TOKEN_DO; }
delete { return TOKEN_DELETE; }
delete\[\] { return TOKEN_DELETE; }
double { return TOKEN_DOUBLE; }
else { return TOKEN_ELSE; }
enum { return TOKEN_ENUM; }
export { return TOKEN_EXPORT; }
extern { return TOKEN_EXTERN; }
false { return TOKEN_FALSE; }
float { return TOKEN_FLOAT; }
for { return TOKEN_FOR; }
foreach { return TOKEN_FOREACH; }
foreach_tiled { return TOKEN_FOREACH_TILED; }
goto { return TOKEN_GOTO; }
if { return TOKEN_IF; }
inline { return TOKEN_INLINE; }
int { return TOKEN_INT; }
int8 { return TOKEN_INT8; }
int16 { return TOKEN_INT16; }
int32 { return TOKEN_INT; }
int64 { return TOKEN_INT64; }
launch { return TOKEN_LAUNCH; }
new { return TOKEN_NEW; }
NULL { return TOKEN_NULL; }
print { return TOKEN_PRINT; }
reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
"please use C++-style '&' syntax for references "
"instead."); }
return { return TOKEN_RETURN; }
soa { return TOKEN_SOA; }
signed { return TOKEN_SIGNED; }
sizeof { return TOKEN_SIZEOF; }
static { return TOKEN_STATIC; }
struct { return TOKEN_STRUCT; }
switch { return TOKEN_SWITCH; }
sync { return TOKEN_SYNC; }
task { return TOKEN_TASK; }
true { return TOKEN_TRUE; }
typedef { return TOKEN_TYPEDEF; }
uniform { return TOKEN_UNIFORM; }
unsigned { return TOKEN_UNSIGNED; }
varying { return TOKEN_VARYING; }
void { return TOKEN_VOID; }
while { return TOKEN_WHILE; }
\"C\" { return TOKEN_STRING_C_LITERAL; }
\.\.\. { return TOKEN_DOTDOTDOT; }
__assert { RT; return TOKEN_ASSERT; }
bool { RT; return TOKEN_BOOL; }
break { RT; return TOKEN_BREAK; }
case { RT; return TOKEN_CASE; }
cbreak { RT; Warning(yylloc, "\"cbreak\" is deprecated. Use \"break\"."); return TOKEN_BREAK; }
ccontinue { RT; Warning(yylloc, "\"ccontinue\" is deprecated. Use \"continue\"."); return TOKEN_CONTINUE; }
cdo { RT; return TOKEN_CDO; }
cfor { RT; return TOKEN_CFOR; }
cif { RT; return TOKEN_CIF; }
cwhile { RT; return TOKEN_CWHILE; }
const { RT; return TOKEN_CONST; }
continue { RT; return TOKEN_CONTINUE; }
creturn { RT; Warning(yylloc, "\"creturn\" is deprecated. Use \"return\"."); return TOKEN_RETURN; }
__declspec { RT; return TOKEN_DECLSPEC; }
default { RT; return TOKEN_DEFAULT; }
do { RT; return TOKEN_DO; }
delete { RT; return TOKEN_DELETE; }
delete\[\] { RT; return TOKEN_DELETE; }
double { RT; return TOKEN_DOUBLE; }
else { RT; return TOKEN_ELSE; }
enum { RT; return TOKEN_ENUM; }
export { RT; return TOKEN_EXPORT; }
extern { RT; return TOKEN_EXTERN; }
false { RT; return TOKEN_FALSE; }
float { RT; return TOKEN_FLOAT; }
for { RT; return TOKEN_FOR; }
foreach { RT; return TOKEN_FOREACH; }
foreach_active { RT; return TOKEN_FOREACH_ACTIVE; }
foreach_tiled { RT; return TOKEN_FOREACH_TILED; }
foreach_unique { RT; return TOKEN_FOREACH_UNIQUE; }
goto { RT; return TOKEN_GOTO; }
if { RT; return TOKEN_IF; }
in { RT; return TOKEN_IN; }
inline { RT; return TOKEN_INLINE; }
int { RT; return TOKEN_INT; }
int8 { RT; return TOKEN_INT8; }
int16 { RT; return TOKEN_INT16; }
int32 { RT; return TOKEN_INT; }
int64 { RT; return TOKEN_INT64; }
launch { RT; return TOKEN_LAUNCH; }
new { RT; return TOKEN_NEW; }
NULL { RT; return TOKEN_NULL; }
print { RT; return TOKEN_PRINT; }
return { RT; return TOKEN_RETURN; }
soa { RT; return TOKEN_SOA; }
signed { RT; return TOKEN_SIGNED; }
sizeof { RT; return TOKEN_SIZEOF; }
static { RT; return TOKEN_STATIC; }
struct { RT; return TOKEN_STRUCT; }
switch { RT; return TOKEN_SWITCH; }
sync { RT; return TOKEN_SYNC; }
task { RT; return TOKEN_TASK; }
true { RT; return TOKEN_TRUE; }
typedef { RT; return TOKEN_TYPEDEF; }
uniform { RT; return TOKEN_UNIFORM; }
unmasked { RT; return TOKEN_UNMASKED; }
unsigned { RT; return TOKEN_UNSIGNED; }
varying { RT; return TOKEN_VARYING; }
void { RT; return TOKEN_VOID; }
while { RT; return TOKEN_WHILE; }
\"C\" { RT; return TOKEN_STRING_C_LITERAL; }
\.\.\. { RT; return TOKEN_DOTDOTDOT; }
L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; }
L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERAL; }
{IDENT} {
RT;
/* We have an identifier--is it a type name or an identifier?
The symbol table will straighten us out... */
yylval->stringVal = new std::string(yytext);
yylval.stringVal = new std::string(yytext);
if (m->symbolTable->LookupType(yytext) != NULL)
return TOKEN_TYPE_NAME;
else
return TOKEN_IDENTIFIER;
}
{INT_NUMBER}+(u|U|l|L)*? {
    int ls = 0, us = 0;
    char *endPtr = NULL;
    if (yytext[0] == '0' && yytext[1] == 'b')
        yylval->intVal = lParseBinary(yytext+2, *yylloc, &endPtr);
    else {
#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
        yylval->intVal = _strtoui64(yytext, &endPtr, 0);
#else
        // FIXME: should use strtouq and then issue an error if we can't
        // fit into 64 bits...
        yylval->intVal = strtoull(yytext, &endPtr, 0);
#endif
    }
    bool kilo = false, mega = false, giga = false;
    for (; *endPtr; endPtr++) {
        if (*endPtr == 'k')
            kilo = true;
        else if (*endPtr == 'M')
            mega = true;
        else if (*endPtr == 'G')
            giga = true;
        else if (*endPtr == 'l' || *endPtr == 'L')
            ls++;
        else if (*endPtr == 'u' || *endPtr == 'U')
            us++;
    }
    if (kilo)
        yylval->intVal *= 1024;
    if (mega)
        yylval->intVal *= 1024*1024;
    if (giga)
        yylval->intVal *= 1024*1024*1024;
    if (ls >= 2)
        return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
    else if (ls == 1)
        return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
    // See if we can fit this into a 32-bit integer...
    if ((yylval->intVal & 0xffffffff) == yylval->intVal)
        return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
    else
        return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
}
{INT_NUMBER} {
    RT;
    return lParseInteger(false);
}
{INT_NUMBER_DOTDOTDOT} {
    RT;
    return lParseInteger(true);
}
{FLOAT_NUMBER} {
yylval->floatVal = (float)atof(yytext);
RT;
yylval.floatVal = (float)atof(yytext);
return TOKEN_FLOAT_CONSTANT;
}
{HEX_FLOAT_NUMBER} {
yylval->floatVal = (float)lParseHexFloat(yytext);
RT;
yylval.floatVal = (float)lParseHexFloat(yytext);
return TOKEN_FLOAT_CONSTANT;
}
"++" { return TOKEN_INC_OP; }
"--" { return TOKEN_DEC_OP; }
"<<" { return TOKEN_LEFT_OP; }
">>" { return TOKEN_RIGHT_OP; }
"<=" { return TOKEN_LE_OP; }
">=" { return TOKEN_GE_OP; }
"==" { return TOKEN_EQ_OP; }
"!=" { return TOKEN_NE_OP; }
"&&" { return TOKEN_AND_OP; }
"||" { return TOKEN_OR_OP; }
"*=" { return TOKEN_MUL_ASSIGN; }
"/=" { return TOKEN_DIV_ASSIGN; }
"%=" { return TOKEN_MOD_ASSIGN; }
"+=" { return TOKEN_ADD_ASSIGN; }
"-=" { return TOKEN_SUB_ASSIGN; }
"<<=" { return TOKEN_LEFT_ASSIGN; }
">>=" { return TOKEN_RIGHT_ASSIGN; }
"&=" { return TOKEN_AND_ASSIGN; }
"^=" { return TOKEN_XOR_ASSIGN; }
"|=" { return TOKEN_OR_ASSIGN; }
"->" { return TOKEN_PTR_OP; }
";" { return ';'; }
("{"|"<%") { return '{'; }
("}"|"%>") { return '}'; }
"," { return ','; }
":" { return ':'; }
"=" { return '='; }
"(" { return '('; }
")" { return ')'; }
("["|"<:") { return '['; }
("]"|":>") { return ']'; }
"." { return '.'; }
"&" { return '&'; }
"!" { return '!'; }
"~" { return '~'; }
"-" { return '-'; }
"+" { return '+'; }
"*" { return '*'; }
"/" { return '/'; }
"%" { return '%'; }
"<" { return '<'; }
">" { return '>'; }
"^" { return '^'; }
"|" { return '|'; }
"?" { return '?'; }
"++" { RT; return TOKEN_INC_OP; }
"--" { RT; return TOKEN_DEC_OP; }
"<<" { RT; return TOKEN_LEFT_OP; }
">>" { RT; return TOKEN_RIGHT_OP; }
"<=" { RT; return TOKEN_LE_OP; }
">=" { RT; return TOKEN_GE_OP; }
"==" { RT; return TOKEN_EQ_OP; }
"!=" { RT; return TOKEN_NE_OP; }
"&&" { RT; return TOKEN_AND_OP; }
"||" { RT; return TOKEN_OR_OP; }
"*=" { RT; return TOKEN_MUL_ASSIGN; }
"/=" { RT; return TOKEN_DIV_ASSIGN; }
"%=" { RT; return TOKEN_MOD_ASSIGN; }
"+=" { RT; return TOKEN_ADD_ASSIGN; }
"-=" { RT; return TOKEN_SUB_ASSIGN; }
"<<=" { RT; return TOKEN_LEFT_ASSIGN; }
">>=" { RT; return TOKEN_RIGHT_ASSIGN; }
"&=" { RT; return TOKEN_AND_ASSIGN; }
"^=" { RT; return TOKEN_XOR_ASSIGN; }
"|=" { RT; return TOKEN_OR_ASSIGN; }
"->" { RT; return TOKEN_PTR_OP; }
";" { RT; return ';'; }
("{"|"<%") { RT; return '{'; }
("}"|"%>") { RT; return '}'; }
"," { RT; return ','; }
":" { RT; return ':'; }
"=" { RT; return '='; }
"(" { RT; return '('; }
")" { RT; return ')'; }
("["|"<:") { RT; return '['; }
("]"|":>") { RT; return ']'; }
"." { RT; return '.'; }
"&" { RT; return '&'; }
"!" { RT; return '!'; }
"~" { RT; return '~'; }
"-" { RT; return '-'; }
"+" { RT; return '+'; }
"*" { RT; return '*'; }
"/" { RT; return '/'; }
"%" { RT; return '%'; }
"<" { RT; return '<'; }
">" { RT; return '>'; }
"^" { RT; return '^'; }
"|" { RT; return '|'; }
"?" { RT; return '?'; }
{WHITESPACE} { }
\n {
yylloc->last_line++;
yylloc->last_column = 1;
yylloc.last_line++;
yylloc.last_column = 1;
}
#(line)?[ ][0-9]+[ ]\"(\\.|[^\\"])*\"[^\n]* {
lHandleCppHash(yylloc);
lHandleCppHash(&yylloc);
}
. {
Error(*yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0]));
Error(yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0]));
YY_USER_ACTION
}
@@ -304,13 +534,94 @@ lParseBinary(const char *ptr, SourcePos pos, char **endPtr) {
}
static int
lParseInteger(bool dotdotdot) {
int ls = 0, us = 0;
char *endPtr = NULL;
if (yytext[0] == '0' && yytext[1] == 'b')
yylval.intVal = lParseBinary(yytext+2, yylloc, &endPtr);
else {
#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
yylval.intVal = _strtoui64(yytext, &endPtr, 0);
#else
// FIXME: should use strtouq and then issue an error if we can't
// fit into 64 bits...
yylval.intVal = strtoull(yytext, &endPtr, 0);
#endif
}
bool kilo = false, mega = false, giga = false;
for (; *endPtr; endPtr++) {
if (*endPtr == 'k')
kilo = true;
else if (*endPtr == 'M')
mega = true;
else if (*endPtr == 'G')
giga = true;
else if (*endPtr == 'l' || *endPtr == 'L')
ls++;
else if (*endPtr == 'u' || *endPtr == 'U')
us++;
else
Assert(dotdotdot && *endPtr == '.');
}
if (kilo)
yylval.intVal *= 1024;
if (mega)
yylval.intVal *= 1024*1024;
if (giga)
yylval.intVal *= 1024*1024*1024;
if (dotdotdot) {
if (ls >= 2)
return us ? TOKEN_UINT64DOTDOTDOT_CONSTANT : TOKEN_INT64DOTDOTDOT_CONSTANT;
else if (ls == 1)
return us ? TOKEN_UINT32DOTDOTDOT_CONSTANT : TOKEN_INT32DOTDOTDOT_CONSTANT;
// See if we can fit this into a 32-bit integer...
if ((yylval.intVal & 0xffffffff) == yylval.intVal)
return us ? TOKEN_UINT32DOTDOTDOT_CONSTANT : TOKEN_INT32DOTDOTDOT_CONSTANT;
else
return us ? TOKEN_UINT64DOTDOTDOT_CONSTANT : TOKEN_INT64DOTDOTDOT_CONSTANT;
}
else {
if (ls >= 2)
return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
else if (ls == 1)
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
else if (us) {
// u suffix only
if (yylval.intVal <= 0xffffffffL)
return TOKEN_UINT32_CONSTANT;
else
return TOKEN_UINT64_CONSTANT;
}
else {
// No u or l suffix
// First, see if we can fit this into a 32-bit integer...
if (yylval.intVal <= 0x7fffffffULL)
return TOKEN_INT32_CONSTANT;
else if (yylval.intVal <= 0xffffffffULL)
return TOKEN_UINT32_CONSTANT;
else if (yylval.intVal <= 0x7fffffffffffffffULL)
return TOKEN_INT64_CONSTANT;
else
return TOKEN_UINT64_CONSTANT;
}
}
}
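/* Editor's note, illustrative only (not part of lex.ll): how lParseInteger's
 * suffix and magnitude rules play out for a few sample literals (no "..."):
 *
 *   literal      deciding rule                      token returned
 *   100          no suffix, fits in int32           TOKEN_INT32_CONSTANT
 *   3000000000   no suffix, > INT32_MAX, fits u32   TOKEN_UINT32_CONSTANT
 *   100u         'u' suffix, fits in uint32         TOKEN_UINT32_CONSTANT
 *   100ll        two 'l' suffixes, no 'u'           TOKEN_INT64_CONSTANT
 *   16k          scaled to 16384, fits in int32     TOKEN_INT32_CONSTANT
 */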
/** Handle a C-style comment in the source.
*/
static void
lCComment(SourcePos *pos) {
char c, prev = 0;
while ((c = yyinput()) != 0) {
++pos->last_column;
if (c == '\n') {
pos->last_line++;
pos->last_column = 1;
@@ -373,6 +684,7 @@ static void lHandleCppHash(SourcePos *pos) {
++src;
}
pos->name = strdup(filename.c_str());
RegisterDependency(filename);
}
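/* Editor's sketch, not part of lex.ll: minimal parsing of a cpp line marker of
 * the form  # 12 "foo.ispc"  (the input handled by lHandleCppHash above) into a
 * line number and file name -- the two pieces lHandleCppHash stores in the
 * SourcePos and, as of this change, passes to RegisterDependency().  The name
 * lParseLineMarker is invented, and escaped characters in the file name are not
 * handled here. */
#include <cstdlib>
#include <cstring>
#include <string>

static bool
lParseLineMarker(const char *text, int *line, std::string *file) {
    const char *p = text + 1;                // skip the leading '#'
    if (strncmp(p, "line", 4) == 0)          // optional "line" keyword
        p += 4;
    char *endp = NULL;
    *line = (int)strtol(p, &endp, 10);       // the line number
    const char *q = strchr(endp, '"');       // opening quote of the file name
    const char *end = q ? strchr(q + 1, '"') : NULL;   // closing quote
    if (q == NULL || end == NULL)
        return false;
    file->assign(q + 1, end - (q + 1));      // file name without the quotes
    return true;
}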
@@ -415,7 +727,7 @@ lEscapeChar(char *str, char *pChar, SourcePos *pos)
str = tail - 1;
break;
default:
Error(*pos, "Bad character escape sequence: '%s'\n.", str);
Error(*pos, "Bad character escape sequence: '%s'.", str);
break;
}
}
@@ -435,7 +747,7 @@ lStringConst(YYSTYPE *yylval, SourcePos *pos)
std::string str;
p = strchr(yytext, '"') + 1;
while (*p != '\"') {
char cval;
char cval = '\0';
p = lEscapeChar(p, &cval, pos);
str.push_back(cval);
}
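/* Editor's sketch, not part of lex.ll: lStringConst walks the characters between
 * the quotes and lets lEscapeChar decode escape sequences one at a time.  The
 * helper below is a simplified decoder in that spirit, covering only the common
 * single-character escapes; it omits the numeric escapes and error reporting of
 * the original, and lDecodeEscape is an invented name. */
static const char *
lDecodeEscape(const char *p, char *out) {
    if (*p != '\\') {            // ordinary character: copy it through
        *out = *p;
        return p + 1;
    }
    switch (p[1]) {              // escaped character
    case 'n':  *out = '\n'; break;
    case 't':  *out = '\t'; break;
    case 'r':  *out = '\r'; break;
    case '\\': *out = '\\'; break;
    case '\'': *out = '\''; break;
    case '"':  *out = '"';  break;
    case '0':  *out = '\0'; break;
    default:   *out = p[1]; break;   // unrecognized escape: pass the character through
    }
    return p + 2;
}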
