Compare commits
563 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8f902fde9c | ||
|
|
b6023c517e | ||
|
|
42d77e9191 | ||
|
|
312a7582df | ||
|
|
dc939eba78 | ||
|
|
f8bec51de2 | ||
|
|
0bf1320a32 | ||
|
|
81dbd504aa | ||
|
|
63dd7d9859 | ||
|
|
2063d34f3e | ||
|
|
83fdc2e5ad | ||
|
|
6ba7368ab0 | ||
|
|
c2805942a9 | ||
|
|
9892c8bf9a | ||
|
|
23e5877509 | ||
|
|
8cbfde6092 | ||
|
|
24087ff3cc | ||
|
|
6827001c1d | ||
|
|
16b0806d40 | ||
|
|
2129b1e27d | ||
|
|
a267762f59 | ||
|
|
65ca795030 | ||
|
|
e82b649ec0 | ||
|
|
275cdb1713 | ||
|
|
d3b86dcc90 | ||
|
|
c736b75075 | ||
|
|
b601331362 | ||
|
|
32d44a5b9e | ||
|
|
810784da1f | ||
|
|
d517b37f3f | ||
|
|
adeef0af01 | ||
|
|
97ddc1ed10 | ||
|
|
bf580648a1 | ||
|
|
ecc54fa0eb | ||
|
|
04d32ae3e6 | ||
|
|
a18b3ae88e | ||
|
|
e57801a5d1 | ||
|
|
da4390aede | ||
|
|
9e85667219 | ||
|
|
b80867d473 | ||
|
|
d742dcce59 | ||
|
|
7a7af3d5f9 | ||
|
|
e323b1d0ad | ||
|
|
3c18c7a713 | ||
|
|
7c16292cb7 | ||
|
|
d665e2e85b | ||
|
|
172a189c6f | ||
|
|
406fbab40e | ||
|
|
09dc217f8c | ||
|
|
9002837750 | ||
|
|
411d5b44ef | ||
|
|
360cc8044e | ||
|
|
ec2e9b5e79 | ||
|
|
881dba61e4 | ||
|
|
6412876f64 | ||
|
|
538d51cbfe | ||
|
|
3dd9ff3d84 | ||
|
|
7f386923b0 | ||
|
|
d2312b1fbd | ||
|
|
6655373ac3 | ||
|
|
d492af7bc0 | ||
|
|
230a7b7374 | ||
|
|
4204a752f7 | ||
|
|
0e88d5f97f | ||
|
|
a13e7f2435 | ||
|
|
be2108260e | ||
|
|
59b0a2b208 | ||
|
|
05a5a42a08 | ||
|
|
f0b0618484 | ||
|
|
4ecdbe4bd9 | ||
|
|
9e9f266e52 | ||
|
|
0ce67f37ac | ||
|
|
ddcd0a49ec | ||
|
|
63b8fac852 | ||
|
|
def8d7850b | ||
|
|
0442efc856 | ||
|
|
f928bbb53c | ||
|
|
1ab7500dbb | ||
|
|
c58d92d46b | ||
|
|
8276e912fd | ||
|
|
e0490d0df5 | ||
|
|
11db466a88 | ||
|
|
caaee0b666 | ||
|
|
f2f470f369 | ||
|
|
09bb36f58c | ||
|
|
21719df6fd | ||
|
|
39329809dd | ||
|
|
44797e2925 | ||
|
|
c8f373d119 | ||
|
|
8a22c63889 | ||
|
|
1a4434d314 | ||
|
|
165a13b13e | ||
|
|
43364b2d69 | ||
|
|
6eaecd20d5 | ||
|
|
c80bfeacf6 | ||
|
|
2a19cc1758 | ||
|
|
8f5189f606 | ||
|
|
49dde7c6f2 | ||
|
|
765a0d8896 | ||
|
|
19d8f2e258 | ||
|
|
e6aec96e05 | ||
|
|
a2d42c3242 | ||
|
|
52836aae87 | ||
|
|
bda566d6a7 | ||
|
|
63ed90b0fd | ||
|
|
0bb4d282e2 | ||
|
|
ae89a65dad | ||
|
|
e9fe9f5043 | ||
|
|
ce8dc5927c | ||
|
|
f6989cce38 | ||
|
|
6dbbf9aa80 | ||
|
|
fe6282e837 | ||
|
|
51210a869b | ||
|
|
658652a9ff | ||
|
|
aecd6e0878 | ||
|
|
1334a84861 | ||
|
|
6a410fc30e | ||
|
|
984a68c3a9 | ||
|
|
daf5aa8e8b | ||
|
|
98b2e0e426 | ||
|
|
9a1932eaf7 | ||
|
|
371d4be8ef | ||
|
|
d180031ef0 | ||
|
|
e09e953bbb | ||
|
|
2c640f7e52 | ||
|
|
2bacebb1fb | ||
|
|
df18b2a150 | ||
|
|
216ac4b1a4 | ||
|
|
898cded646 | ||
|
|
c09c87873e | ||
|
|
10b79fb41b | ||
|
|
ec0280be11 | ||
|
|
8e19d54e75 | ||
|
|
3c070e5e20 | ||
|
|
dde599f48f | ||
|
|
cc15ecfb3a | ||
|
|
7a7c54bd59 | ||
|
|
bea88ab122 | ||
|
|
926b3b9ee3 | ||
|
|
bc7775aef2 | ||
|
|
107669686c | ||
|
|
bb11b3ab66 | ||
|
|
516ba85abd | ||
|
|
098277b4f0 | ||
|
|
950a989744 | ||
|
|
fb8b893b10 | ||
|
|
9ca80debb8 | ||
|
|
080241b7d1 | ||
|
|
0d534720bb | ||
|
|
1dc4424a30 | ||
|
|
57f0cf30c0 | ||
|
|
8ef6bc1636 | ||
|
|
974b40c8af | ||
|
|
45e9e0be0b | ||
|
|
ec0918045d | ||
|
|
38bcecd2f3 | ||
|
|
aabbdba068 | ||
|
|
84c183da1f | ||
|
|
b363b98211 | ||
|
|
8defbeb248 | ||
|
|
f52d227d80 | ||
|
|
78cb45fb25 | ||
|
|
2d8026625b | ||
|
|
73afab464f | ||
|
|
8aa139b6be | ||
|
|
e5fe0eabdc | ||
|
|
0d3993fa25 | ||
|
|
ac421f68e2 | ||
|
|
b9d1f0db18 | ||
|
|
6aad4c7a39 | ||
|
|
4186ef204d | ||
|
|
ae7a094ee0 | ||
|
|
3a007f939a | ||
|
|
b8503b9255 | ||
|
|
b7bc76d3cc | ||
|
|
27d6c12972 | ||
|
|
b69d783e09 | ||
|
|
3b2ff6301c | ||
|
|
6c7043916e | ||
|
|
96a6e75b71 | ||
|
|
a91e4e7981 | ||
|
|
95d8f76ec3 | ||
|
|
66d4c2ddd9 | ||
|
|
8115ca739a | ||
|
|
ec4021bbf4 | ||
|
|
e431b07e04 | ||
|
|
d34a87404d | ||
|
|
f38770bf2a | ||
|
|
dc9998ccaf | ||
|
|
f1b3703389 | ||
|
|
b6a8d0ee7f | ||
|
|
2a4dff38d0 | ||
|
|
665c564dcf | ||
|
|
ed71413e04 | ||
|
|
4b5e49b00b | ||
|
|
f558ee788e | ||
|
|
ceb8ca680c | ||
|
|
79ebcbec4b | ||
|
|
2c7b650240 | ||
|
|
54459255d4 | ||
|
|
b4a078e2f6 | ||
|
|
ed13dd066b | ||
|
|
2b4a3b22bf | ||
|
|
8b891da628 | ||
|
|
5a2c8342eb | ||
|
|
50eb4bf53a | ||
|
|
3c10ddd46a | ||
|
|
0b7f9acc70 | ||
|
|
10fbaec247 | ||
|
|
007a734595 | ||
|
|
46716aada3 | ||
|
|
3bc66136b2 | ||
|
|
fae47e0dfc | ||
|
|
bd52e86486 | ||
|
|
b2f6ed7209 | ||
|
|
4b334fd2e2 | ||
|
|
a23a7006e3 | ||
|
|
f47171a17c | ||
|
|
4945dc3682 | ||
|
|
ada66b5313 | ||
|
|
96450e17a3 | ||
|
|
40a295e951 | ||
|
|
d6c6f95373 | ||
|
|
19b46be20d | ||
|
|
789e04ce90 | ||
|
|
dd4f0a600b | ||
|
|
6c7df4cb6b | ||
|
|
79e0a9f32a | ||
|
|
6c9bc63a1c | ||
|
|
28a821df7d | ||
|
|
27e39954d6 | ||
|
|
e730a5364b | ||
|
|
92b3ae41dd | ||
|
|
89a2566e01 | ||
|
|
1ac3e03171 | ||
|
|
b86d40091a | ||
|
|
91d22d150f | ||
|
|
1d29991268 | ||
|
|
6f0a2686dc | ||
|
|
f06caabb07 | ||
|
|
3c869802fb | ||
|
|
7b6bd90903 | ||
|
|
967bfa9c92 | ||
|
|
592affb984 | ||
|
|
96aaf6d53b | ||
|
|
1397dbdabc | ||
|
|
6118643232 | ||
|
|
71198a0b54 | ||
|
|
22cb80399f | ||
|
|
fa1fd8a576 | ||
|
|
6df7d31a5b | ||
|
|
ef049e92ef | ||
|
|
fe8b109ca5 | ||
|
|
8fd9b84a80 | ||
|
|
5cb53f52c3 | ||
|
|
d86653668e | ||
|
|
5084712a15 | ||
|
|
ece65cab18 | ||
|
|
1f6075506c | ||
|
|
51ade48e3d | ||
|
|
21c43737fe | ||
|
|
6c7bcf00e7 | ||
|
|
7a2142075c | ||
|
|
e8e9baa417 | ||
|
|
449d956966 | ||
|
|
90db01d038 | ||
|
|
38cea6dc71 | ||
|
|
64807dfb3b | ||
|
|
d943455e10 | ||
|
|
fd03ba7586 | ||
|
|
2c5a57e386 | ||
|
|
e8858150cb | ||
|
|
333f901187 | ||
|
|
7dd4d6c75e | ||
|
|
99f57cfda6 | ||
|
|
4d1eb94dfd | ||
|
|
22d584f302 | ||
|
|
72c41f104e | ||
|
|
8d3ac3ac1e | ||
|
|
299ae186f1 | ||
|
|
f4df2fb176 | ||
|
|
625fbef613 | ||
|
|
fbed0ac56b | ||
|
|
dc120f3962 | ||
|
|
4f053e5b83 | ||
|
|
c6241581a0 | ||
|
|
041ade66d5 | ||
|
|
067a2949ba | ||
|
|
55c754750e | ||
|
|
72b6c12856 | ||
|
|
15ea0af687 | ||
|
|
ee7e367981 | ||
|
|
8006589828 | ||
|
|
413264eaae | ||
|
|
7db8824da2 | ||
|
|
e1bc010bd1 | ||
|
|
bff02017da | ||
|
|
c0019bd8e5 | ||
|
|
e495ef2c48 | ||
|
|
78d62705cc | ||
|
|
2791bd0015 | ||
|
|
7cf66eb61f | ||
|
|
944c53bff1 | ||
|
|
c756c855ea | ||
|
|
58bb2826b2 | ||
|
|
b7bef87a4d | ||
|
|
0c1b206185 | ||
|
|
7d7e99a92c | ||
|
|
1ba8d7ef74 | ||
|
|
d99bd279e8 | ||
|
|
ee1fe3aa9f | ||
|
|
c4b1d79c5c | ||
|
|
a1a43cdfe0 | ||
|
|
27b62781cc | ||
|
|
0c5d7ff8f2 | ||
|
|
0e2b315ded | ||
|
|
3e74d1c544 | ||
|
|
da690acce5 | ||
|
|
0baa2b484d | ||
|
|
260d7298c3 | ||
|
|
d5cc2ad643 | ||
|
|
12706cd37f | ||
|
|
7167442d6e | ||
|
|
8547101c4b | ||
|
|
5d58a9e4c2 | ||
|
|
cd98a29a4b | ||
|
|
903714fd40 | ||
|
|
138c7acf22 | ||
|
|
03b2b8ae8f | ||
|
|
016b502d46 | ||
|
|
c5f6653564 | ||
|
|
cf9a4e209e | ||
|
|
040421942f | ||
|
|
4dfc596d38 | ||
|
|
fe83ef7635 | ||
|
|
db8b08131f | ||
|
|
32815e628d | ||
|
|
71bdc67a45 | ||
|
|
cb9f50ef63 | ||
|
|
12c754c92b | ||
|
|
e4b3d03da5 | ||
|
|
cc26b66e99 | ||
|
|
34d81fa522 | ||
|
|
49f1a5c2b3 | ||
|
|
326c45fa17 | ||
|
|
a2bb899a6b | ||
|
|
9fedb1674e | ||
|
|
7c91b01125 | ||
|
|
c202e9e106 | ||
|
|
645a8c9349 | ||
|
|
093fdcf3df | ||
|
|
7abda5e8c2 | ||
|
|
abf7c423bb | ||
|
|
55d5c07d00 | ||
|
|
0a9b272fe4 | ||
|
|
b9d6ba2aa0 | ||
|
|
a0c9f7823b | ||
|
|
4477a9c59a | ||
|
|
99a27fe241 | ||
|
|
fefa86e0cf | ||
|
|
098c4910de | ||
|
|
17b7148300 | ||
|
|
f4a2ef28e3 | ||
|
|
f0d013ee76 | ||
|
|
5ece6fec04 | ||
|
|
d88dbf3612 | ||
|
|
2a18efef82 | ||
|
|
fd846fbe77 | ||
|
|
ca7cc4744e | ||
|
|
491fa239bd | ||
|
|
66765dc123 | ||
|
|
70a5348f43 | ||
|
|
2aa61007c6 | ||
|
|
acfbe77ffc | ||
|
|
08696653ca | ||
|
|
8a1a214ca9 | ||
|
|
7aaeb27e0f | ||
|
|
972043c146 | ||
|
|
8475dc082a | ||
|
|
d0e583b29c | ||
|
|
c8feee238b | ||
|
|
6712ecd928 | ||
|
|
d0c7b5d35c | ||
|
|
802add1f97 | ||
|
|
95556811fa | ||
|
|
581472564d | ||
|
|
c7dc8862a5 | ||
|
|
4f8cf019ca | ||
|
|
4c9ac7fcf1 | ||
|
|
1dac05960a | ||
|
|
c27418da77 | ||
|
|
637d076e99 | ||
|
|
391678a5b3 | ||
|
|
4cd0cf1650 | ||
|
|
b813452d33 | ||
|
|
eb85da81e1 | ||
|
|
920cf63201 | ||
|
|
dc09d46bf4 | ||
|
|
05d1b06eeb | ||
|
|
c1661eb06b | ||
|
|
e9626a1d10 | ||
|
|
560bf5ca09 | ||
|
|
512f8d8b60 | ||
|
|
87c8a89349 | ||
|
|
255791f18e | ||
|
|
d5e3416e8e | ||
|
|
5b2d43f665 | ||
|
|
540fc6c2f3 | ||
|
|
b3c5043dcc | ||
|
|
d0d9aae968 | ||
|
|
3270e2bf5a | ||
|
|
013a3e7567 | ||
|
|
8368ba8539 | ||
|
|
ca0310e335 | ||
|
|
4690a678c1 | ||
|
|
f8a39402a2 | ||
|
|
b923e4daea | ||
|
|
247775d1ec | ||
|
|
6e9fea377d | ||
|
|
ca5c65d032 | ||
|
|
f9dc621ebe | ||
|
|
ffe484c31e | ||
|
|
62cd3418ca | ||
|
|
d8a8f3a996 | ||
|
|
0ad8dbbfc9 | ||
|
|
e15a1946c6 | ||
|
|
8878826661 | ||
|
|
95a8b6e5e8 | ||
|
|
388d0d2cfd | ||
|
|
d3a374e71c | ||
|
|
1da2834b1e | ||
|
|
ca3100874f | ||
|
|
117f48a331 | ||
|
|
89bbceefee | ||
|
|
7e18f0e247 | ||
|
|
29c2f24faf | ||
|
|
3bb2dee275 | ||
|
|
88cd5584e8 | ||
|
|
41f9ce2560 | ||
|
|
20044f5749 | ||
|
|
833f0a6aa7 | ||
|
|
10c5ba140c | ||
|
|
316de0b880 | ||
|
|
989966f81b | ||
|
|
ccd550dc52 | ||
|
|
ddf350839a | ||
|
|
6a7dd2787a | ||
|
|
385771e73e | ||
|
|
349ab0b9c5 | ||
|
|
b5e6c6a2f3 | ||
|
|
2832ea641f | ||
|
|
cb7edf2725 | ||
|
|
f1f1be2822 | ||
|
|
7dffd65609 | ||
|
|
2c8a44e28b | ||
|
|
39bb95a6ee | ||
|
|
da9dba80a0 | ||
|
|
12f3285f9b | ||
|
|
7e954e4248 | ||
|
|
d74cc6397b | ||
|
|
777343331e | ||
|
|
a062653743 | ||
|
|
57af0eb64f | ||
|
|
60aae16752 | ||
|
|
e264d95019 | ||
|
|
0664f5a724 | ||
|
|
17c6a19527 | ||
|
|
cbc8b8259b | ||
|
|
1067a2e4be | ||
|
|
74a031a759 | ||
|
|
ee437193fb | ||
|
|
436c53037e | ||
|
|
f55ba9d3cb | ||
|
|
8adb99b768 | ||
|
|
13c42412d2 | ||
|
|
75507d8b35 | ||
|
|
ddfe4932ac | ||
|
|
cf208cc2e3 | ||
|
|
28ac016928 | ||
|
|
f4ae41d006 | ||
|
|
9ec8e5a275 | ||
|
|
a473046058 | ||
|
|
a69b7a5a01 | ||
|
|
640918bcc0 | ||
|
|
f39fbdb3fc | ||
|
|
50d4d81062 | ||
|
|
3b95452481 | ||
|
|
c152ae3c32 | ||
|
|
f6cbaa78e8 | ||
|
|
7adb250b59 | ||
|
|
db5db5aefd | ||
|
|
8fdf84de04 | ||
|
|
ff5cbe80d1 | ||
|
|
e013e0a374 | ||
|
|
b7df312ca7 | ||
|
|
ce82c3c0ae | ||
|
|
2f958cfbda | ||
|
|
8ef41dfd97 | ||
|
|
3082ea4765 | ||
|
|
e482d29951 | ||
|
|
ff48dd7bfb | ||
|
|
7bf9c11822 | ||
|
|
f7937f1e4b | ||
|
|
0115eeabfe | ||
|
|
4b9c3ec0da | ||
|
|
55b81e35a7 | ||
|
|
2a1c7f2d47 | ||
|
|
8603f9838f | ||
|
|
95224f3f11 | ||
|
|
f81acbfe80 | ||
|
|
6d7ff7eba2 | ||
|
|
ad429db7e8 | ||
|
|
4c07abbaf4 | ||
|
|
e3c0551129 | ||
|
|
8971baa42b | ||
|
|
317a1f51f7 | ||
|
|
c63d139482 | ||
|
|
9e682362e9 | ||
|
|
56ec939692 | ||
|
|
a86b942730 | ||
|
|
52eb4c6014 | ||
|
|
f4adbbf90c | ||
|
|
cc86e4a7d2 | ||
|
|
e864447e4a | ||
|
|
73bf552cd6 | ||
|
|
f20a2d2ee9 | ||
|
|
0c25bc063c | ||
|
|
db72781d2a | ||
|
|
0c8ad09040 | ||
|
|
49880ab761 | ||
|
|
fe2d9aa600 | ||
|
|
1dead425e4 | ||
|
|
adb1e47a59 | ||
|
|
ffba8580c1 | ||
|
|
ea18427d29 | ||
|
|
97d42f5c53 | ||
|
|
f3089df086 | ||
|
|
157e7c97ae | ||
|
|
bb8e13e3c9 | ||
|
|
5b4673e8eb | ||
|
|
5b9de8cc07 | ||
|
|
33ea934c8f | ||
|
|
6b3e14b0a4 | ||
|
|
098ceb5567 | ||
|
|
8e2b0632e8 | ||
|
|
420d373d89 | ||
|
|
a59fd7eeb3 | ||
|
|
ee91fa1228 | ||
|
|
a2b5ce0172 | ||
|
|
3efbc71a01 | ||
|
|
b7c5af7e64 | ||
|
|
f939015b97 | ||
|
|
a9ed71f553 | ||
|
|
96a429694f | ||
|
|
fddc5e022e | ||
|
|
2236d53def | ||
|
|
4e018d0a20 | ||
|
|
977b983771 | ||
|
|
fa7a7fe23e | ||
|
|
724a843bbd | ||
|
|
0db752f3a2 | ||
|
|
ee8b6ebbf6 | ||
|
|
f2b99ccb08 |
9
.gitignore
vendored
9
.gitignore
vendored
@@ -5,4 +5,11 @@ ispc
|
||||
ispc_test
|
||||
objs
|
||||
docs/doxygen
|
||||
docs/ispc.html
|
||||
docs/*.html
|
||||
tests*/*cpp
|
||||
tests*/*run
|
||||
examples/*/*.png
|
||||
examples/*/*.ppm
|
||||
examples/*/objs/*
|
||||
|
||||
|
||||
|
||||
64
Makefile
64
Makefile
@@ -2,6 +2,15 @@
|
||||
# ispc Makefile
|
||||
#
|
||||
|
||||
# If you have your own special version of llvm and/or clang, change
|
||||
# these variables to match.
|
||||
LLVM_CONFIG=$(shell which llvm-config)
|
||||
CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
|
||||
|
||||
# Add llvm bin to the path so any scripts run will go to the right llvm-config
|
||||
LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
|
||||
export PATH:=$(LLVM_BIN):$(PATH)
|
||||
|
||||
ARCH_OS = $(shell uname)
|
||||
ifeq ($(ARCH_OS), Darwin)
|
||||
ARCH_OS2 = "OSX"
|
||||
@@ -10,27 +19,17 @@ else
|
||||
endif
|
||||
ARCH_TYPE = $(shell arch)
|
||||
|
||||
ifeq ($(shell llvm-config --version), 3.1svn)
|
||||
LLVM_LIBS=-lLLVMAsmParser -lLLVMInstrumentation -lLLVMLinker \
|
||||
-lLLVMArchive -lLLVMBitReader -lLLVMDebugInfo -lLLVMJIT -lLLVMipo \
|
||||
-lLLVMBitWriter -lLLVMTableGen -lLLVMCBackendInfo \
|
||||
-lLLVMX86Disassembler -lLLVMX86CodeGen -lLLVMSelectionDAG \
|
||||
-lLLVMAsmPrinter -lLLVMX86AsmParser -lLLVMX86Desc -lLLVMX86Info \
|
||||
-lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCDisassembler -lLLVMMCParser \
|
||||
-lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMTransformUtils \
|
||||
-lLLVMipa -lLLVMAnalysis -lLLVMMCJIT -lLLVMRuntimeDyld \
|
||||
-lLLVMExecutionEngine -lLLVMTarget -lLLVMMC -lLLVMObject -lLLVMCore \
|
||||
-lLLVMSupport
|
||||
else
|
||||
LLVM_LIBS=$(shell llvm-config --libs)
|
||||
endif
|
||||
LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs)
|
||||
|
||||
CLANG=clang
|
||||
CLANG_LIBS = -lclangFrontend -lclangDriver \
|
||||
-lclangSerialization -lclangParse -lclangSema \
|
||||
-lclangAnalysis -lclangAST -lclangLex -lclangBasic
|
||||
ifneq ($(shell $(LLVM_CONFIG) --version), 3.0)
|
||||
CLANG_LIBS += -lclangEdit
|
||||
endif
|
||||
|
||||
ISPC_LIBS=$(shell llvm-config --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
|
||||
ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
|
||||
-lpthread
|
||||
|
||||
ifeq ($(ARCH_OS),Linux)
|
||||
@@ -41,8 +40,8 @@ ifeq ($(ARCH_OS2),Msys)
|
||||
ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
|
||||
endif
|
||||
|
||||
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
|
||||
LLVM_VERSION=LLVM_$(shell llvm-config --version | sed s/\\./_/)
|
||||
LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
|
||||
LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//)
|
||||
LLVM_VERSION_DEF=-D$(LLVM_VERSION)
|
||||
|
||||
BUILD_DATE=$(shell date +%Y%m%d)
|
||||
@@ -50,15 +49,16 @@ BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
|
||||
|
||||
CXX=g++
|
||||
CPP=cpp
|
||||
OPT=-g3
|
||||
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
|
||||
OPT=-O2
|
||||
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \
|
||||
-Wall $(LLVM_VERSION_DEF) \
|
||||
-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
|
||||
|
||||
LDFLAGS=
|
||||
ifeq ($(ARCH_OS),Linux)
|
||||
# try to link everything statically under Linux (including libstdc++) so
|
||||
# that the binaries we generate will be portable across distributions...
|
||||
LDFLAGS=-static
|
||||
# LDFLAGS=-static
|
||||
endif
|
||||
|
||||
LEX=flex
|
||||
@@ -71,8 +71,8 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
|
||||
type.cpp util.cpp
|
||||
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
|
||||
opt.h stmt.h sym.h type.h util.h
|
||||
TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
|
||||
generic-16 generic-1
|
||||
TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
|
||||
generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
|
||||
BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
|
||||
builtins/dispatch.ll
|
||||
BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
|
||||
@@ -86,10 +86,10 @@ OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
|
||||
|
||||
default: ispc
|
||||
|
||||
.PHONY: dirs clean depend doxygen print_llvm_src
|
||||
.PHONY: dirs clean depend doxygen print_llvm_src llvm_check
|
||||
.PRECIOUS: objs/builtins-%.cpp
|
||||
|
||||
depend: $(CXX_SRC) $(HEADERS)
|
||||
depend: llvm_check $(CXX_SRC) $(HEADERS)
|
||||
@echo Updating dependencies
|
||||
@gcc -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend
|
||||
|
||||
@@ -99,7 +99,15 @@ dirs:
|
||||
@echo Creating objs/ directory
|
||||
@/bin/mkdir -p objs
|
||||
|
||||
print_llvm_src:
|
||||
llvm_check:
|
||||
@llvm-config --version > /dev/null || \
|
||||
(echo; \
|
||||
echo "******************************************"; \
|
||||
echo "ERROR: llvm-config not found in your PATH"; \
|
||||
echo "******************************************"; \
|
||||
echo; exit 1)
|
||||
|
||||
print_llvm_src: llvm_check
|
||||
@echo Using LLVM `llvm-config --version` from `llvm-config --libdir`
|
||||
|
||||
clean:
|
||||
@@ -111,7 +119,7 @@ doxygen:
|
||||
|
||||
ispc: print_llvm_src dirs $(OBJS)
|
||||
@echo Creating ispc executable
|
||||
@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
|
||||
@$(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
@echo Compiling $<
|
||||
@@ -121,6 +129,10 @@ objs/cbackend.o: cbackend.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/opt.o: opt.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) -fno-rtti $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/%.o: objs/%.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
131
ast.cpp
131
ast.cpp
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -32,8 +32,10 @@
|
||||
*/
|
||||
|
||||
/** @file ast.cpp
|
||||
@brief
|
||||
*/
|
||||
|
||||
@brief General functionality related to abstract syntax trees and
|
||||
traversal of them.
|
||||
*/
|
||||
|
||||
#include "ast.h"
|
||||
#include "expr.h"
|
||||
@@ -53,10 +55,10 @@ ASTNode::~ASTNode() {
|
||||
// AST
|
||||
|
||||
void
|
||||
AST::AddFunction(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code) {
|
||||
AST::AddFunction(Symbol *sym, Stmt *code) {
|
||||
if (sym == NULL)
|
||||
return;
|
||||
functions.push_back(new Function(sym, args, code));
|
||||
functions.push_back(new Function(sym, code));
|
||||
}
|
||||
|
||||
|
||||
@@ -90,6 +92,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
||||
DoStmt *dos;
|
||||
ForStmt *fs;
|
||||
ForeachStmt *fes;
|
||||
ForeachActiveStmt *fas;
|
||||
ForeachUniqueStmt *fus;
|
||||
CaseStmt *cs;
|
||||
DefaultStmt *defs;
|
||||
SwitchStmt *ss;
|
||||
@@ -99,6 +103,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
||||
PrintStmt *ps;
|
||||
AssertStmt *as;
|
||||
DeleteStmt *dels;
|
||||
UnmaskedStmt *ums;
|
||||
|
||||
if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
|
||||
es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
|
||||
@@ -135,6 +140,13 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
||||
postFunc, data);
|
||||
fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((fas = dynamic_cast<ForeachActiveStmt *>(node)) != NULL) {
|
||||
fas->stmts = (Stmt *)WalkAST(fas->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((fus = dynamic_cast<ForeachUniqueStmt *>(node)) != NULL) {
|
||||
fus->expr = (Expr *)WalkAST(fus->expr, preFunc, postFunc, data);
|
||||
fus->stmts = (Stmt *)WalkAST(fus->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
|
||||
cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
|
||||
else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
|
||||
@@ -151,7 +163,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
||||
else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
|
||||
ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
|
||||
else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
|
||||
rs->val = (Expr *)WalkAST(rs->val, preFunc, postFunc, data);
|
||||
rs->expr = (Expr *)WalkAST(rs->expr, preFunc, postFunc, data);
|
||||
else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
|
||||
std::vector<Stmt *> &sls = sl->stmts;
|
||||
for (unsigned int i = 0; i < sls.size(); ++i)
|
||||
@@ -163,6 +175,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
||||
as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
|
||||
else if ((dels = dynamic_cast<DeleteStmt *>(node)) != NULL)
|
||||
dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
|
||||
else if ((ums = dynamic_cast<UnmaskedStmt *>(node)) != NULL)
|
||||
ums->stmts = (Stmt *)WalkAST(ums->stmts, preFunc, postFunc, data);
|
||||
else
|
||||
FATAL("Unhandled statement type in WalkAST()");
|
||||
}
|
||||
@@ -180,7 +194,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
||||
MemberExpr *me;
|
||||
TypeCastExpr *tce;
|
||||
ReferenceExpr *re;
|
||||
DereferenceExpr *dre;
|
||||
PtrDerefExpr *ptrderef;
|
||||
RefDerefExpr *refderef;
|
||||
SizeOfExpr *soe;
|
||||
AddressOfExpr *aoe;
|
||||
NewExpr *newe;
|
||||
@@ -221,8 +236,12 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
||||
tce->expr = (Expr *)WalkAST(tce->expr, preFunc, postFunc, data);
|
||||
else if ((re = dynamic_cast<ReferenceExpr *>(node)) != NULL)
|
||||
re->expr = (Expr *)WalkAST(re->expr, preFunc, postFunc, data);
|
||||
else if ((dre = dynamic_cast<DereferenceExpr *>(node)) != NULL)
|
||||
dre->expr = (Expr *)WalkAST(dre->expr, preFunc, postFunc, data);
|
||||
else if ((ptrderef = dynamic_cast<PtrDerefExpr *>(node)) != NULL)
|
||||
ptrderef->expr = (Expr *)WalkAST(ptrderef->expr, preFunc, postFunc,
|
||||
data);
|
||||
else if ((refderef = dynamic_cast<RefDerefExpr *>(node)) != NULL)
|
||||
refderef->expr = (Expr *)WalkAST(refderef->expr, preFunc, postFunc,
|
||||
data);
|
||||
else if ((soe = dynamic_cast<SizeOfExpr *>(node)) != NULL)
|
||||
soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
|
||||
else if ((aoe = dynamic_cast<AddressOfExpr *>(node)) != NULL)
|
||||
@@ -300,19 +319,39 @@ TypeCheck(Stmt *stmt) {
|
||||
}
|
||||
|
||||
|
||||
struct CostData {
|
||||
CostData() { cost = foreachDepth = 0; }
|
||||
|
||||
int cost;
|
||||
int foreachDepth;
|
||||
};
|
||||
|
||||
|
||||
static bool
|
||||
lCostCallback(ASTNode *node, void *c) {
|
||||
int *cost = (int *)c;
|
||||
*cost += node->EstimateCost();
|
||||
lCostCallbackPre(ASTNode *node, void *d) {
|
||||
CostData *data = (CostData *)d;
|
||||
if (dynamic_cast<ForeachStmt *>(node) != NULL)
|
||||
++data->foreachDepth;
|
||||
if (data->foreachDepth == 0)
|
||||
data->cost += node->EstimateCost();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static ASTNode *
|
||||
lCostCallbackPost(ASTNode *node, void *d) {
|
||||
CostData *data = (CostData *)d;
|
||||
if (dynamic_cast<ForeachStmt *>(node) != NULL)
|
||||
--data->foreachDepth;
|
||||
return node;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
EstimateCost(ASTNode *root) {
|
||||
int cost = 0;
|
||||
WalkAST(root, lCostCallback, NULL, &cost);
|
||||
return cost;
|
||||
CostData data;
|
||||
WalkAST(root, lCostCallbackPre, lCostCallbackPost, &data);
|
||||
return data.cost;
|
||||
}
|
||||
|
||||
|
||||
@@ -323,14 +362,22 @@ static bool
|
||||
lCheckAllOffSafety(ASTNode *node, void *data) {
|
||||
bool *okPtr = (bool *)data;
|
||||
|
||||
if (dynamic_cast<FunctionCallExpr *>(node) != NULL) {
|
||||
// FIXME: If we could somehow determine that the function being
|
||||
// called was safe (and all of the args Exprs were safe, then it'd
|
||||
// be nice to be able to return true here. (Consider a call to
|
||||
// e.g. floatbits() in the stdlib.) Unfortunately for now we just
|
||||
// have to be conservative.
|
||||
*okPtr = false;
|
||||
return false;
|
||||
FunctionCallExpr *fce;
|
||||
if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
|
||||
if (fce->func == NULL)
|
||||
return false;
|
||||
|
||||
const Type *type = fce->func->GetType();
|
||||
const PointerType *pt = CastType<PointerType>(type);
|
||||
if (pt != NULL)
|
||||
type = pt->GetBaseType();
|
||||
const FunctionType *ftype = CastType<FunctionType>(type);
|
||||
Assert(ftype != NULL);
|
||||
|
||||
if (ftype->isSafe == false) {
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (dynamic_cast<AssertStmt *>(node) != NULL) {
|
||||
@@ -350,17 +397,29 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (g->target.allOffMaskIsSafe == true)
|
||||
// Don't worry about memory accesses if we have a target that can
|
||||
// safely run them with the mask all off
|
||||
return true;
|
||||
if (dynamic_cast<ForeachStmt *>(node) != NULL ||
|
||||
dynamic_cast<ForeachActiveStmt *>(node) != NULL ||
|
||||
dynamic_cast<ForeachUniqueStmt *>(node) != NULL ||
|
||||
dynamic_cast<UnmaskedStmt *>(node) != NULL) {
|
||||
// The various foreach statements also shouldn't be run with an
|
||||
// all-off mask. Since they can re-establish an 'all on' mask,
|
||||
// this would be pretty unintuitive. (More generally, it's
|
||||
// possibly a little strange to allow foreach in the presence of
|
||||
// any non-uniform control flow...)
|
||||
//
|
||||
// Similarly, the implementation of foreach_unique assumes as a
|
||||
// precondition that the mask won't be all off going into it, so
|
||||
// we'll enforce that here...
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
IndexExpr *ie;
|
||||
if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
|
||||
const Type *type = ie->baseExpr->GetType();
|
||||
if (type == NULL)
|
||||
return true;
|
||||
if (dynamic_cast<const ReferenceType *>(type) != NULL)
|
||||
if (CastType<ReferenceType>(type) != NULL)
|
||||
type = type->GetReferenceTarget();
|
||||
|
||||
ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
|
||||
@@ -370,16 +429,14 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const PointerType *pointerType =
|
||||
dynamic_cast<const PointerType *>(type);
|
||||
const PointerType *pointerType = CastType<PointerType>(type);
|
||||
if (pointerType != NULL) {
|
||||
// pointer[index] -> can't be sure -> not safe
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
const SequentialType *seqType =
|
||||
dynamic_cast<const SequentialType *>(type);
|
||||
const SequentialType *seqType = CastType<SequentialType>(type);
|
||||
Assert(seqType != NULL);
|
||||
int nElements = seqType->GetElementCount();
|
||||
if (nElements == 0) {
|
||||
@@ -409,13 +466,9 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
|
||||
return false;
|
||||
}
|
||||
|
||||
DereferenceExpr *de;
|
||||
if ((de = dynamic_cast<DereferenceExpr *>(node)) != NULL) {
|
||||
const Type *exprType = de->expr->GetType();
|
||||
if (dynamic_cast<const PointerType *>(exprType) != NULL) {
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
if (dynamic_cast<PtrDerefExpr *>(node) != NULL) {
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
5
ast.h
5
ast.h
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -84,8 +84,7 @@ class AST {
|
||||
public:
|
||||
/** Add the AST for a function described by the given declaration
|
||||
information and source code. */
|
||||
void AddFunction(Symbol *sym, const std::vector<Symbol *> &args,
|
||||
Stmt *code);
|
||||
void AddFunction(Symbol *sym, Stmt *code);
|
||||
|
||||
/** Generate LLVM IR for all of the functions into the current
|
||||
module. */
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
|
||||
REM If LLVM_INSTALL_DIR isn't set globally in your environment,
|
||||
REM it can be set here_
|
||||
set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
|
||||
set LLVM_VERSION=3.1svn
|
||||
REM set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
|
||||
REM set LLVM_VERSION=3.2
|
||||
|
||||
REM Both the LLVM binaries and python need to be in the path
|
||||
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
|
||||
|
||||
190
builtins.cpp
190
builtins.cpp
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -47,12 +47,25 @@
|
||||
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#include <llvm/Type.h>
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Intrinsics.h>
|
||||
#if defined(LLVM_3_2)
|
||||
#include <llvm/Attributes.h>
|
||||
#endif
|
||||
#if defined(LLVM_3_1) || defined(LLVM_3_2)
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#include <llvm/Type.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Intrinsics.h>
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#else
|
||||
#include <llvm/IR/Attributes.h>
|
||||
#include <llvm/IR/LLVMContext.h>
|
||||
#include <llvm/IR/Module.h>
|
||||
#include <llvm/IR/Type.h>
|
||||
#include <llvm/IR/Instructions.h>
|
||||
#include <llvm/IR/Intrinsics.h>
|
||||
#include <llvm/IR/DerivedTypes.h>
|
||||
#endif
|
||||
#include <llvm/Linker.h>
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/ADT/Triple.h>
|
||||
@@ -157,7 +170,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
|
||||
static void
|
||||
lCreateSymbol(const std::string &name, const Type *returnType,
|
||||
const std::vector<const Type *> &argTypes,
|
||||
llvm::SmallVector<const Type *, 8> &argTypes,
|
||||
const llvm::FunctionType *ftype, llvm::Function *func,
|
||||
SymbolTable *symbolTable) {
|
||||
SourcePos noPos;
|
||||
@@ -199,7 +212,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
// bool, so just have a one-off override for that one...
|
||||
if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") {
|
||||
const Type *returnType = AtomicType::VaryingInt32;
|
||||
std::vector<const Type *> argTypes;
|
||||
llvm::SmallVector<const Type *, 8> argTypes;
|
||||
argTypes.push_back(AtomicType::VaryingBool);
|
||||
|
||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||
@@ -229,7 +242,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
// Iterate over the arguments and try to find their equivalent ispc
|
||||
// types. Track if any of the arguments has an integer type.
|
||||
bool anyIntArgs = false;
|
||||
std::vector<const Type *> argTypes;
|
||||
llvm::SmallVector<const Type *, 8> argTypes;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
|
||||
const llvm::Type *llvmArgType = ftype->getParamType(j);
|
||||
const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
|
||||
@@ -291,7 +304,7 @@ lCheckModuleIntrinsics(llvm::Module *module) {
|
||||
if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
|
||||
llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
|
||||
Assert(id != 0);
|
||||
LLVM_TYPE_CONST llvm::Type *intrinsicType =
|
||||
llvm::Type *intrinsicType =
|
||||
llvm::Intrinsic::getType(*g->ctx, id);
|
||||
intrinsicType = llvm::PointerType::get(intrinsicType, 0);
|
||||
Assert(func->getType() == intrinsicType);
|
||||
@@ -322,6 +335,8 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__add_varying_double",
|
||||
"__add_varying_int32",
|
||||
"__add_varying_int64",
|
||||
"__all",
|
||||
"__any",
|
||||
"__aos_to_soa3_float",
|
||||
"__aos_to_soa3_float16",
|
||||
"__aos_to_soa3_float4",
|
||||
@@ -411,12 +426,17 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__extract_int64",
|
||||
"__extract_int8",
|
||||
"__fastmath",
|
||||
"__float_to_half_uniform",
|
||||
"__float_to_half_varying",
|
||||
"__floatbits_uniform_int32",
|
||||
"__floatbits_varying_int32",
|
||||
"__floor_uniform_double",
|
||||
"__floor_uniform_float",
|
||||
"__floor_varying_double",
|
||||
"__floor_varying_float",
|
||||
"__get_system_isa",
|
||||
"__half_to_float_uniform",
|
||||
"__half_to_float_varying",
|
||||
"__insert_int16",
|
||||
"__insert_int32",
|
||||
"__insert_int64",
|
||||
@@ -438,6 +458,12 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__max_varying_uint32",
|
||||
"__max_varying_uint64",
|
||||
"__memory_barrier",
|
||||
"__memcpy32",
|
||||
"__memcpy64",
|
||||
"__memmove32",
|
||||
"__memmove64",
|
||||
"__memset32",
|
||||
"__memset64",
|
||||
"__min_uniform_double",
|
||||
"__min_uniform_float",
|
||||
"__min_uniform_int32",
|
||||
@@ -454,9 +480,11 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__new_uniform",
|
||||
"__new_varying32",
|
||||
"__new_varying64",
|
||||
"__none",
|
||||
"__num_cores",
|
||||
"__packed_load_active",
|
||||
"__packed_store_active",
|
||||
"__pause",
|
||||
"__popcnt_int32",
|
||||
"__popcnt_int64",
|
||||
"__prefetch_read_uniform_1",
|
||||
@@ -465,12 +493,13 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__prefetch_read_uniform_nt",
|
||||
"__rcp_uniform_float",
|
||||
"__rcp_varying_float",
|
||||
"__rdrand_i16",
|
||||
"__rdrand_i32",
|
||||
"__rdrand_i64",
|
||||
"__reduce_add_double",
|
||||
"__reduce_add_float",
|
||||
"__reduce_add_int32",
|
||||
"__reduce_add_int64",
|
||||
"__reduce_add_uint32",
|
||||
"__reduce_add_uint64",
|
||||
"__reduce_equal_double",
|
||||
"__reduce_equal_float",
|
||||
"__reduce_equal_int32",
|
||||
@@ -499,6 +528,7 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__round_varying_float",
|
||||
"__rsqrt_uniform_float",
|
||||
"__rsqrt_varying_float",
|
||||
"__set_system_isa",
|
||||
"__sext_uniform_bool",
|
||||
"__sext_varying_bool",
|
||||
"__shuffle2_double",
|
||||
@@ -527,6 +557,8 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__sqrt_uniform_float",
|
||||
"__sqrt_varying_double",
|
||||
"__sqrt_varying_float",
|
||||
"__stdlib_acosf",
|
||||
"__stdlib_asinf",
|
||||
"__stdlib_atan",
|
||||
"__stdlib_atan2",
|
||||
"__stdlib_atan2f",
|
||||
@@ -606,11 +638,18 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
mTriple.getVendor() == bcTriple.getVendor());
|
||||
bcModule->setTargetTriple(mTriple.str());
|
||||
|
||||
// This is also suboptimal; LLVM issues a warning about linking
|
||||
// modules with different datalayouts, due to things like
|
||||
// bulitins-c.c having the regular IA layout, but the generic
|
||||
// targets having a layout with 16-bit alignment for 16xi1 vectors.
|
||||
// As long as builtins-c.c doesn't have any 16xi1 vector types
|
||||
// (which it shouldn't!), then this override is safe.
|
||||
if (g->target.isa == Target::GENERIC)
|
||||
bcModule->setDataLayout(module->getDataLayout());
|
||||
|
||||
std::string(linkError);
|
||||
if (llvm::Linker::LinkModules(module, bcModule,
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
llvm::Linker::DestroySource,
|
||||
#endif // LLVM_3_0
|
||||
&linkError))
|
||||
Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
|
||||
lSetInternalFunctions(module);
|
||||
@@ -627,15 +666,37 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
static void
|
||||
lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
||||
SymbolTable *symbolTable) {
|
||||
Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32,
|
||||
SC_STATIC);
|
||||
pw->constValue = new ConstExpr(pw->type, val, SourcePos());
|
||||
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
|
||||
Symbol *sym =
|
||||
new Symbol(name, SourcePos(), AtomicType::UniformInt32->GetAsConstType(),
|
||||
SC_STATIC);
|
||||
sym->constValue = new ConstExpr(sym->type, val, SourcePos());
|
||||
llvm::Type *ltype = LLVMTypes::Int32Type;
|
||||
llvm::Constant *linit = LLVMInt32(val);
|
||||
pw->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
|
||||
llvm::GlobalValue::InternalLinkage,
|
||||
linit, pw->name.c_str());
|
||||
symbolTable->AddVariable(pw);
|
||||
// Use WeakODRLinkage rather than InternalLinkage so that a definition
|
||||
// survives even if it's not used in the module, so that the symbol is
|
||||
// there in the debugger.
|
||||
llvm::GlobalValue::LinkageTypes linkage = g->generateDebuggingSymbols ?
|
||||
llvm::GlobalValue::WeakODRLinkage : llvm::GlobalValue::InternalLinkage;
|
||||
sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, linkage,
|
||||
linit, name);
|
||||
symbolTable->AddVariable(sym);
|
||||
|
||||
if (m->diBuilder != NULL) {
|
||||
llvm::DIFile file;
|
||||
llvm::DIType diType = sym->type->GetDIType(file);
|
||||
Assert(diType.Verify());
|
||||
// FIXME? DWARF says that this (and programIndex below) should
|
||||
// have the DW_AT_artifical attribute. It's not clear if this
|
||||
// matters for anything though.
|
||||
llvm::DIGlobalVariable var =
|
||||
m->diBuilder->createGlobalVariable(name,
|
||||
file,
|
||||
0 /* line */,
|
||||
diType,
|
||||
true /* static */,
|
||||
sym->storagePtr);
|
||||
Assert(var.Verify());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -643,13 +704,17 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
||||
static void
|
||||
lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
||||
SymbolTable *symbolTable) {
|
||||
std::vector<const Type *> args;
|
||||
llvm::SmallVector<const Type *, 8> args;
|
||||
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
|
||||
Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
|
||||
|
||||
llvm::Function *func = module->getFunction(name);
|
||||
Assert(func != NULL); // it should be declared already...
|
||||
#if defined(LLVM_3_2)
|
||||
func->addFnAttr(llvm::Attributes::AlwaysInline);
|
||||
#else
|
||||
func->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
#endif
|
||||
llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
|
||||
llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
|
||||
|
||||
@@ -661,20 +726,37 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
||||
|
||||
static void
|
||||
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
Symbol *pidx = new Symbol("programIndex", SourcePos(),
|
||||
AtomicType::VaryingConstInt32, SC_STATIC);
|
||||
Symbol *sym =
|
||||
new Symbol("programIndex", SourcePos(),
|
||||
AtomicType::VaryingInt32->GetAsConstType(), SC_STATIC);
|
||||
|
||||
int pi[ISPC_MAX_NVEC];
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
pi[i] = i;
|
||||
pidx->constValue = new ConstExpr(pidx->type, pi, SourcePos());
|
||||
sym->constValue = new ConstExpr(sym->type, pi, SourcePos());
|
||||
|
||||
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32VectorType;
|
||||
llvm::Type *ltype = LLVMTypes::Int32VectorType;
|
||||
llvm::Constant *linit = LLVMInt32Vector(pi);
|
||||
pidx->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
|
||||
llvm::GlobalValue::InternalLinkage, linit,
|
||||
pidx->name.c_str());
|
||||
symbolTable->AddVariable(pidx);
|
||||
// See comment in lDefineConstantInt() for why WeakODRLinkage is used here
|
||||
llvm::GlobalValue::LinkageTypes linkage = g->generateDebuggingSymbols ?
|
||||
llvm::GlobalValue::WeakODRLinkage : llvm::GlobalValue::InternalLinkage;
|
||||
sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, linkage,
|
||||
linit, sym->name.c_str());
|
||||
symbolTable->AddVariable(sym);
|
||||
|
||||
if (m->diBuilder != NULL) {
|
||||
llvm::DIFile file;
|
||||
llvm::DIType diType = sym->type->GetDIType(file);
|
||||
Assert(diType.Verify());
|
||||
llvm::DIGlobalVariable var =
|
||||
m->diBuilder->createGlobalVariable(sym->name.c_str(),
|
||||
file,
|
||||
0 /* line */,
|
||||
diType,
|
||||
false /* static */,
|
||||
sym->storagePtr);
|
||||
Assert(var.Verify());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -756,6 +838,26 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
case Target::AVX11:
|
||||
switch (g->target.vectorWidth) {
|
||||
case 8:
|
||||
extern unsigned char builtins_bitcode_avx11[];
|
||||
extern int builtins_bitcode_avx11_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx11,
|
||||
builtins_bitcode_avx11_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 16:
|
||||
extern unsigned char builtins_bitcode_avx11_x2[];
|
||||
extern int builtins_bitcode_avx11_x2_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx11_x2,
|
||||
builtins_bitcode_avx11_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
case Target::AVX2:
|
||||
switch (g->target.vectorWidth) {
|
||||
case 8:
|
||||
@@ -799,6 +901,20 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
builtins_bitcode_generic_16_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 32:
|
||||
extern unsigned char builtins_bitcode_generic_32[];
|
||||
extern int builtins_bitcode_generic_32_length;
|
||||
AddBitcodeToModule(builtins_bitcode_generic_32,
|
||||
builtins_bitcode_generic_32_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 64:
|
||||
extern unsigned char builtins_bitcode_generic_64[];
|
||||
extern int builtins_bitcode_generic_64_length;
|
||||
AddBitcodeToModule(builtins_bitcode_generic_64,
|
||||
builtins_bitcode_generic_64_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 1:
|
||||
extern unsigned char builtins_bitcode_generic_1[];
|
||||
extern int builtins_bitcode_generic_1_length;
|
||||
@@ -831,10 +947,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
symbolTable);
|
||||
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
|
||||
symbolTable);
|
||||
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
|
||||
symbolTable);
|
||||
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload,
|
||||
module, symbolTable);
|
||||
|
||||
lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
|
||||
lDefineConstantInt("__have_native_half", g->target.hasHalf, module,
|
||||
symbolTable);
|
||||
lDefineConstantInt("__have_native_rand", g->target.hasRand, module,
|
||||
symbolTable);
|
||||
lDefineConstantInt("__have_native_transcendentals", g->target.hasTranscendentals,
|
||||
module, symbolTable);
|
||||
|
||||
if (includeStdlibISPC) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -59,22 +59,39 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
|
||||
typedef int Bool;
|
||||
|
||||
#define PRINT_SCALAR(fmt, type) \
|
||||
printf(fmt, *((type *)ptr)); \
|
||||
#define PRINT_BUF_SIZE 4096
|
||||
|
||||
#define APPEND(str) \
|
||||
do { \
|
||||
int offset = bufp - &printString[0]; \
|
||||
*bufp = '\0'; \
|
||||
strncat(bufp, str, PRINT_BUF_SIZE-offset); \
|
||||
bufp += strlen(str); \
|
||||
if (bufp >= &printString[PRINT_BUF_SIZE]) \
|
||||
goto done; \
|
||||
} while (0) /* eat semicolon */
|
||||
|
||||
|
||||
#define PRINT_SCALAR(fmt, type) \
|
||||
sprintf(tmpBuf, fmt, *((type *)ptr)); \
|
||||
APPEND(tmpBuf); \
|
||||
break
|
||||
|
||||
#define PRINT_VECTOR(fmt, type) \
|
||||
putchar('['); \
|
||||
*bufp++ = '['; \
|
||||
if (bufp == &printString[PRINT_BUF_SIZE]) break; \
|
||||
for (int i = 0; i < width; ++i) { \
|
||||
/* only print the value if the current lane is executing */ \
|
||||
if (mask & (1<<i)) \
|
||||
printf(fmt, ((type *)ptr)[i]); \
|
||||
if (mask & (1ull<<i)) \
|
||||
sprintf(tmpBuf, fmt, ((type *)ptr)[i]); \
|
||||
else \
|
||||
printf("((" fmt "))", ((type *)ptr)[i]); \
|
||||
putchar(i != width-1 ? ',' : ']'); \
|
||||
sprintf(tmpBuf, "((" fmt "))", ((type *)ptr)[i]); \
|
||||
APPEND(tmpBuf); \
|
||||
*bufp++ = (i != width-1 ? ',' : ']'); \
|
||||
} \
|
||||
break
|
||||
|
||||
@@ -89,16 +106,18 @@ typedef int Bool;
|
||||
@param mask Current lane mask when the print statemnt is called
|
||||
@param args Array of pointers to the values to be printed
|
||||
*/
|
||||
void __do_print(const char *format, const char *types, int width, int mask,
|
||||
void __do_print(const char *format, const char *types, int width, uint64_t mask,
|
||||
void **args) {
|
||||
if (mask == 0)
|
||||
return;
|
||||
char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
|
||||
char *bufp = &printString[0];
|
||||
char tmpBuf[256];
|
||||
|
||||
int argCount = 0;
|
||||
while (*format) {
|
||||
while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
|
||||
// Format strings are just single percent signs.
|
||||
if (*format != '%')
|
||||
putchar(*format);
|
||||
if (*format != '%') {
|
||||
*bufp++ = *format;
|
||||
}
|
||||
else {
|
||||
if (*types) {
|
||||
void *ptr = args[argCount++];
|
||||
@@ -107,17 +126,22 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
||||
// printf() formatting string.
|
||||
switch (*types) {
|
||||
case 'b': {
|
||||
printf("%s", *((Bool *)ptr) ? "true" : "false");
|
||||
sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
|
||||
APPEND(tmpBuf);
|
||||
break;
|
||||
}
|
||||
case 'B': {
|
||||
putchar('[');
|
||||
*bufp++ = '[';
|
||||
if (bufp == &printString[PRINT_BUF_SIZE])
|
||||
break;
|
||||
for (int i = 0; i < width; ++i) {
|
||||
if (mask & (1<<i))
|
||||
printf("%s", ((Bool *)ptr)[i] ? "true" : "false");
|
||||
if (mask & (1ull << i)) {
|
||||
sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
|
||||
APPEND(tmpBuf);
|
||||
}
|
||||
else
|
||||
printf("_________");
|
||||
putchar(i != width-1 ? ',' : ']');
|
||||
APPEND("_________");
|
||||
*bufp++ = (i != width-1) ? ',' : ']';
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -136,14 +160,18 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
||||
case 'p': PRINT_SCALAR("%p", void *);
|
||||
case 'P': PRINT_VECTOR("%p", void *);
|
||||
default:
|
||||
printf("UNKNOWN TYPE ");
|
||||
putchar(*types);
|
||||
APPEND("UNKNOWN TYPE ");
|
||||
*bufp++ = *types;
|
||||
}
|
||||
++types;
|
||||
}
|
||||
}
|
||||
++format;
|
||||
}
|
||||
|
||||
done:
|
||||
*bufp = '\0';
|
||||
fputs(printString, stdout);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
|
||||
@@ -48,8 +48,8 @@ declare void @abort() noreturn
|
||||
;; corresponding to one of the Target::ISA enumerant values that gives the
|
||||
;; most capable ISA that the curremt system can run.
|
||||
;;
|
||||
;; Note: clang from LLVM 2.9 should be used if this is updated, for maximum
|
||||
;; backwards compatibility for anyone building ispc with LLVM 2.9.
|
||||
;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum
|
||||
;; backwards compatibility for anyone building ispc with LLVM 3.0
|
||||
;;
|
||||
;; #include <stdint.h>
|
||||
;; #include <stdlib.h>
|
||||
@@ -76,13 +76,19 @@ declare void @abort() noreturn
|
||||
;; /* NOTE: the values returned below must be the same as the
|
||||
;; corresponding enumerant values in Target::ISA. */
|
||||
;; if ((info[2] & (1 << 28)) != 0) {
|
||||
;; // AVX1 for sure. Do we have AVX2?
|
||||
;; // Call cpuid with eax=7, ecx=0
|
||||
;; __cpuid_count(info, 7, 0);
|
||||
;; if ((info[1] & (1 << 5)) != 0)
|
||||
;; return 3; // AVX2
|
||||
;; else
|
||||
;; return 2; // AVX1
|
||||
;; if ((info[2] & (1 << 29)) != 0 && // F16C
|
||||
;; (info[2] & (1 << 30)) != 0) { // RDRAND
|
||||
;; // So far, so good. AVX2?
|
||||
;; // Call cpuid with eax=7, ecx=0
|
||||
;; int info2[4];
|
||||
;; __cpuid_count(info2, 7, 0);
|
||||
;; if ((info2[1] & (1 << 5)) != 0)
|
||||
;; return 4;
|
||||
;; else
|
||||
;; return 3;
|
||||
;; }
|
||||
;; // Regular AVX
|
||||
;; return 2;
|
||||
;; }
|
||||
;; else if ((info[2] & (1 << 19)) != 0)
|
||||
;; return 1; // SSE4
|
||||
@@ -92,41 +98,44 @@ declare void @abort() noreturn
|
||||
;; abort();
|
||||
;; }
|
||||
|
||||
%0 = type { i32, i32, i32, i32 }
|
||||
|
||||
define i32 @__get_system_isa() nounwind ssp {
|
||||
define i32 @__get_system_isa() nounwind uwtable ssp {
|
||||
entry:
|
||||
%0 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
|
||||
%asmresult9.i = extractvalue %0 %0, 2
|
||||
%asmresult10.i = extractvalue %0 %0, 3
|
||||
%and = and i32 %asmresult9.i, 268435456
|
||||
%0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
|
||||
%asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2
|
||||
%asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
|
||||
%and = and i32 %asmresult5.i, 268435456
|
||||
%cmp = icmp eq i32 %and, 0
|
||||
br i1 %cmp, label %if.else7, label %if.then
|
||||
br i1 %cmp, label %if.else13, label %if.then
|
||||
|
||||
if.then: ; preds = %entry
|
||||
%1 = tail call %0 asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
|
||||
%asmresult9.i24 = extractvalue %0 %1, 1
|
||||
%and4 = lshr i32 %asmresult9.i24, 5
|
||||
%2 = and i32 %and4, 1
|
||||
%3 = or i32 %2, 2
|
||||
%1 = and i32 %asmresult5.i, 1610612736
|
||||
%2 = icmp eq i32 %1, 1610612736
|
||||
br i1 %2, label %if.then7, label %return
|
||||
|
||||
if.then7: ; preds = %if.then
|
||||
%3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
|
||||
%asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1
|
||||
%and10 = lshr i32 %asmresult4.i28, 5
|
||||
%4 = and i32 %and10, 1
|
||||
%5 = add i32 %4, 3
|
||||
br label %return
|
||||
|
||||
if.else7: ; preds = %entry
|
||||
%and10 = and i32 %asmresult9.i, 524288
|
||||
%cmp11 = icmp eq i32 %and10, 0
|
||||
br i1 %cmp11, label %if.else13, label %return
|
||||
if.else13: ; preds = %entry
|
||||
%and15 = and i32 %asmresult5.i, 524288
|
||||
%cmp16 = icmp eq i32 %and15, 0
|
||||
br i1 %cmp16, label %if.else18, label %return
|
||||
|
||||
if.else13: ; preds = %if.else7
|
||||
%and16 = and i32 %asmresult10.i, 67108864
|
||||
%cmp17 = icmp eq i32 %and16, 0
|
||||
br i1 %cmp17, label %if.else19, label %return
|
||||
if.else18: ; preds = %if.else13
|
||||
%and20 = and i32 %asmresult6.i, 67108864
|
||||
%cmp21 = icmp eq i32 %and20, 0
|
||||
br i1 %cmp21, label %if.else23, label %return
|
||||
|
||||
if.else19: ; preds = %if.else13
|
||||
if.else23: ; preds = %if.else18
|
||||
tail call void @abort() noreturn nounwind
|
||||
unreachable
|
||||
|
||||
return: ; preds = %if.else13, %if.else7, %if.then
|
||||
%retval.0 = phi i32 [ %3, %if.then ], [ 1, %if.else7 ], [ 0, %if.else13 ]
|
||||
return: ; preds = %if.else18, %if.else13, %if.then7, %if.then
|
||||
%retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ]
|
||||
ret i32 %retval.0
|
||||
}
|
||||
|
||||
|
||||
@@ -254,10 +254,10 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -158,13 +158,13 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
|
||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define <16 x float> @__max_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
|
||||
define <16 x float> @__min_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
@@ -175,7 +175,7 @@ define <16 x float> @__min_varying_float(<16 x float>,
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
@@ -186,9 +186,57 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
|
||||
%v1shift = shl i32 %v1, 8
|
||||
%v = or i32 %v1shift, %v0
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
define i1 @__any(<16 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
|
||||
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
|
||||
|
||||
%v1shift = shl i32 %v1, 8
|
||||
%v = or i32 %v1shift, %v0
|
||||
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__all(<16 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
|
||||
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
|
||||
|
||||
%v1shift = shl i32 %v1, 8
|
||||
%v = or i32 %v1shift, %v0
|
||||
%cmp = icmp eq i32 %v, 65535
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__none(<16 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
|
||||
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
|
||||
|
||||
%v1shift = shl i32 %v1, 8
|
||||
%v = or i32 %v1shift, %v0
|
||||
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal float ops
|
||||
|
||||
@@ -224,7 +272,7 @@ reduce_equal(16)
|
||||
;; horizontal int32 ops
|
||||
|
||||
define <16 x i32> @__add_varying_int32(<16 x i32>,
|
||||
<16 x i32>) nounwind readnone alwaysinline {
|
||||
<16 x i32>) nounwind readnone alwaysinline {
|
||||
%s = add <16 x i32> %0, %1
|
||||
ret <16 x i32> %s
|
||||
}
|
||||
@@ -252,11 +300,6 @@ define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint32 ops
|
||||
|
||||
define i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<16 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
@@ -334,11 +377,6 @@ define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint64 ops
|
||||
|
||||
define i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
|
||||
%r = call i64 @__reduce_add_int64(<16 x i64> %v)
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
@@ -352,19 +390,14 @@ define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(16, i8, 8)
|
||||
load_and_broadcast(16, i16, 16)
|
||||
load_and_broadcast(16, i32, 32)
|
||||
load_and_broadcast(16, i64, 64)
|
||||
|
||||
; no masked load instruction for i8 and i16 types??
|
||||
masked_load(16, i8, 8, 1)
|
||||
masked_load(16, i16, 16, 2)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
|
||||
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
|
||||
|
||||
define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
define <16 x i32> @__masked_load_i32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %mask to <16 x float>
|
||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
@@ -382,7 +415,7 @@ define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin
|
||||
}
|
||||
|
||||
|
||||
define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
; double up masks, bitcast to doubles
|
||||
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
@@ -416,6 +449,7 @@ define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
|
||||
ret <16 x i64> %val
|
||||
}
|
||||
|
||||
masked_load_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
@@ -423,15 +457,15 @@ define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
|
||||
; FIXME: there is no AVX instruction for these, but we could be clever
|
||||
; by packing the bits down and setting the last 3/4 or half, respectively,
|
||||
; of the mask to zero... Not sure if this would be a win in the end
|
||||
gen_masked_store(16, i8, 8)
|
||||
gen_masked_store(16, i16, 16)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
|
||||
; note that mask is the 2nd parameter, not the 3rd one!!
|
||||
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
|
||||
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
|
||||
|
||||
define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
%ptr = bitcast <16 x i32> * %0 to i8 *
|
||||
%val = bitcast <16 x i32> %1 to <16 x float>
|
||||
%mask = bitcast <16 x i32> %2 to <16 x float>
|
||||
@@ -453,8 +487,8 @@ define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
|
||||
<16 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
|
||||
<16 x i32> %mask) nounwind alwaysinline {
|
||||
%ptr = bitcast <16 x i64> * %0 to i8 *
|
||||
%val = bitcast <16 x i64> %1 to <16 x double>
|
||||
|
||||
@@ -492,14 +526,15 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
|
||||
ret void
|
||||
}
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
masked_store_blend_8_16_by_16()
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
|
||||
<8 x float>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
%maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
|
||||
%oldValue = load <16 x i32>* %0, align 4
|
||||
%oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
|
||||
@@ -536,8 +571,8 @@ define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
|
||||
<4 x double>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
|
||||
<16 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
|
||||
<16 x i32> %mask) nounwind alwaysinline {
|
||||
%oldValue = load <16 x i64>* %ptr, align 8
|
||||
%old = bitcast <16 x i64> %oldValue to <16 x double>
|
||||
%old0d = shufflevector <16 x double> %old, <16 x double> undef,
|
||||
@@ -597,10 +632,12 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; scatter
|
||||
|
||||
gen_scatter(16, i8)
|
||||
gen_scatter(16, i16)
|
||||
gen_scatter(16, i32)
|
||||
gen_scatter(16, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -158,13 +158,13 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
|
||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__max_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define <8 x float> @__min_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
@@ -175,10 +175,32 @@ define <8 x float> @__min_varying_float(<8 x float>,
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp eq i32 %v, 255
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@@ -239,11 +261,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint32 ops
|
||||
|
||||
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
@@ -315,11 +332,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint64 ops
|
||||
|
||||
define i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
|
||||
%r = call i64 @__reduce_add_int64(<8 x i64> %v)
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
@@ -333,19 +345,15 @@ define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(8, i8, 8)
|
||||
load_and_broadcast(8, i16, 16)
|
||||
load_and_broadcast(8, i32, 32)
|
||||
load_and_broadcast(8, i64, 64)
|
||||
|
||||
; no masked load instruction for i8 and i16 types??
|
||||
masked_load(8, i8, 8, 1)
|
||||
masked_load(8, i16, 16, 2)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
|
||||
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
|
||||
|
||||
define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
define <8 x i32> @__masked_load_i32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %mask to <8 x float>
|
||||
%floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
|
||||
%retval = bitcast <8 x float> %floatval to <8 x i32>
|
||||
@@ -353,7 +361,7 @@ define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline
|
||||
}
|
||||
|
||||
|
||||
define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
; double up masks, bitcast to doubles
|
||||
%mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
@@ -372,19 +380,20 @@ define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
|
||||
ret <8 x i64> %val
|
||||
}
|
||||
|
||||
masked_load_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(8, i8, 8)
|
||||
gen_masked_store(8, i16, 16)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
|
||||
; note that mask is the 2nd parameter, not the 3rd one!!
|
||||
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
|
||||
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
|
||||
|
||||
define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
define void @__masked_store_i32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
%ptr = bitcast <8 x i32> * %0 to i8 *
|
||||
%val = bitcast <8 x i32> %1 to <8 x float>
|
||||
%mask = bitcast <8 x i32> %2 to <8 x float>
|
||||
@@ -392,8 +401,8 @@ define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
%ptr = bitcast <8 x i64> * %0 to i8 *
|
||||
%val = bitcast <8 x i64> %1 to <8 x double>
|
||||
|
||||
@@ -417,14 +426,13 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
||||
}
|
||||
|
||||
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
|
||||
<8 x float>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
|
||||
%oldValue = load <8 x i32>* %0, align 4
|
||||
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
|
||||
@@ -438,8 +446,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
}
|
||||
|
||||
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %i32mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %i32mask) nounwind alwaysinline {
|
||||
%oldValue = load <8 x i64>* %ptr, align 8
|
||||
%mask = bitcast <8 x i32> %i32mask to <8 x float>
|
||||
|
||||
@@ -488,14 +496,17 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
ret void
|
||||
}
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; scatter
|
||||
|
||||
gen_scatter(8, i8)
|
||||
gen_scatter(8, i16)
|
||||
gen_scatter(8, i32)
|
||||
gen_scatter(8, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
@@ -31,6 +31,8 @@
|
||||
|
||||
include(`target-avx-x2.ll')
|
||||
|
||||
rdrand_decls()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
@@ -61,17 +63,19 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
ifelse(NO_HALF_DECLARES, `1', `', `
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather(16, i8)
|
||||
gen_gather(16, i16)
|
||||
gen_gather(16, i32)
|
||||
gen_gather(16, i64)
|
||||
|
||||
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
@@ -31,6 +31,8 @@
|
||||
|
||||
include(`target-avx.ll')
|
||||
|
||||
rdrand_decls()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
@@ -61,15 +63,19 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
ifelse(NO_HALF_DECLARES, `1', `', `
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
132
builtins/target-avx11-x2.ll
Normal file
132
builtins/target-avx11-x2.ll
Normal file
@@ -0,0 +1,132 @@
|
||||
;; Copyright (c) 2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx-x2.ll')
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
|
||||
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
|
||||
`rdrand_definition()')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
gen_gather(i32)
|
||||
gen_gather(float)
|
||||
gen_gather(i64)
|
||||
gen_gather(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
;; nothing to define...
|
||||
', `
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
|
||||
; 0 is round nearest even
|
||||
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
|
||||
%r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
|
||||
%r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
|
||||
%r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
ret <16 x float> %r
|
||||
}
|
||||
|
||||
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
|
||||
%r_0 = shufflevector <16 x float> %v, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
|
||||
%r_1 = shufflevector <16 x float> %v, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
|
||||
%r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
ret <16 x i16> %r
|
||||
}
|
||||
|
||||
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
|
||||
%v1 = bitcast i16 %v to <1 x i16>
|
||||
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
|
||||
%r = extractelement <8 x float> %rv, i32 0
|
||||
ret float %r
|
||||
}
|
||||
|
||||
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
|
||||
%v1 = bitcast float %v to <1 x float>
|
||||
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; round to nearest even
|
||||
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
|
||||
%r = extractelement <8 x i16> %rv, i32 0
|
||||
ret i16 %r
|
||||
}
|
||||
'
|
||||
)
|
||||
115
builtins/target-avx11.ll
Normal file
115
builtins/target-avx11.ll
Normal file
@@ -0,0 +1,115 @@
|
||||
;; Copyright (c) 2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx.ll')
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
|
||||
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
|
||||
`rdrand_definition()')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
gen_gather(i32)
|
||||
gen_gather(float)
|
||||
gen_gather(i64)
|
||||
gen_gather(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
;; nothing to define...
|
||||
', `
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
|
||||
; 0 is round nearest even
|
||||
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
|
||||
%r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
|
||||
ret <8 x float> %r
|
||||
}
|
||||
|
||||
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
|
||||
%r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
|
||||
ret <8 x i16> %r
|
||||
}
|
||||
|
||||
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
|
||||
%v1 = bitcast i16 %v to <1 x i16>
|
||||
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
|
||||
%r = extractelement <8 x float> %rv, i32 0
|
||||
ret float %r
|
||||
}
|
||||
|
||||
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
|
||||
%v1 = bitcast float %v to <1 x float>
|
||||
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; round to nearest even
|
||||
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
|
||||
%r = extractelement <8 x i16> %rv, i32 0
|
||||
ret i16 %r
|
||||
}
|
||||
')
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -29,8 +29,16 @@
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `',
|
||||
LLVM_VERSION, `LLVM_3_1', `',
|
||||
`define(`HAVE_GATHER', `1')')
|
||||
|
||||
include(`target-avx-x2.ll')
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
|
||||
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
|
||||
`rdrand_definition()')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
@@ -66,6 +74,9 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
;; nothing to define...
|
||||
', `
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
|
||||
; 0 is round nearest even
|
||||
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
|
||||
@@ -116,14 +127,435 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
|
||||
%r = extractelement <8 x i16> %rv, i32 0
|
||||
ret i16 %r
|
||||
}
|
||||
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather(16, i8)
|
||||
gen_gather(16, i16)
|
||||
gen_gather(16, i32)
|
||||
gen_gather(16, i64)
|
||||
declare void @llvm.trap() noreturn nounwind
|
||||
|
||||
; $1: type
|
||||
; $2: var base name
|
||||
define(`extract_4s', `
|
||||
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||
%$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||
')
|
||||
|
||||
; $1: type
|
||||
; $2: var base name
|
||||
define(`extract_8s', `
|
||||
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
')
|
||||
|
||||
; $1: element type
|
||||
; $2: ret name
|
||||
; $3: v1
|
||||
; $4: v2
|
||||
define(`assemble_8s', `
|
||||
%$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
')
|
||||
|
||||
; $1: element type
|
||||
; $2: ret name
|
||||
; $3: v1
|
||||
; $4: v2
|
||||
; $5: v3
|
||||
; $6: v4
|
||||
define(`assemble_4s', `
|
||||
%$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
assemble_8s($1, $2, $2_1, $2_2)
|
||||
')
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)',
|
||||
LLVM_VERSION, `LLVM_3_1', `
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)', `
|
||||
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int32 gathers
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
|
||||
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
|
||||
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
|
||||
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
|
||||
|
||||
define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
extract_8s(i32, offsets)
|
||||
extract_8s(i32, vecmask)
|
||||
|
||||
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
|
||||
<8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
|
||||
<8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale8)
|
||||
|
||||
assemble_8s(i32, v, v1, v2)
|
||||
|
||||
ret <16 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
|
||||
i32 %scale, <16 x i64> %offsets,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
|
||||
extract_4s(i32, vecmask)
|
||||
extract_4s(i64, offsets)
|
||||
|
||||
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
|
||||
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale8)
|
||||
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale8)
|
||||
|
||||
assemble_4s(i32, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
define <16 x i32> @__gather32_i32(<16 x i32> %ptrs,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
extract_8s(i32, ptrs)
|
||||
extract_8s(i32, vecmask)
|
||||
|
||||
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
|
||||
<8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1)
|
||||
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
|
||||
<8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1)
|
||||
|
||||
assemble_8s(i32, v, v1, v2)
|
||||
|
||||
ret <16 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
define <16 x i32> @__gather64_i32(<16 x i64> %ptrs,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
extract_4s(i64, ptrs)
|
||||
extract_4s(i32, vecmask)
|
||||
|
||||
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
|
||||
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
|
||||
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
|
||||
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
|
||||
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
|
||||
<4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1)
|
||||
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
|
||||
<4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1)
|
||||
|
||||
assemble_4s(i32, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float gathers
|
||||
|
||||
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
|
||||
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
|
||||
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
|
||||
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
|
||||
|
||||
define <16 x float> @__gather_base_offsets32_float(i8 * %ptr,
|
||||
i32 %scale, <16 x i32> %offsets,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
%mask = bitcast <16 x i32> %vecmask to <16 x float>
|
||||
extract_8s(i32, offsets)
|
||||
extract_8s(float, mask)
|
||||
|
||||
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
|
||||
<8 x i32> %offsets_1, <8 x float> %mask_1, i8 %scale8)
|
||||
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
|
||||
<8 x i32> %offsets_2, <8 x float> %mask_2, i8 %scale8)
|
||||
|
||||
assemble_8s(float, v, v1, v2)
|
||||
|
||||
ret <16 x float> %v
|
||||
}
|
||||
|
||||
|
||||
define <16 x float> @__gather_base_offsets64_float(i8 * %ptr,
|
||||
i32 %scale, <16 x i64> %offsets,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
%mask = bitcast <16 x i32> %vecmask to <16 x float>
|
||||
extract_4s(i64, offsets)
|
||||
extract_4s(float, mask)
|
||||
|
||||
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
|
||||
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
|
||||
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_3, <4 x float> %mask_3, i8 %scale8)
|
||||
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_4, <4 x float> %mask_4, i8 %scale8)
|
||||
|
||||
assemble_4s(float, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x float> %v
|
||||
}
|
||||
|
||||
|
||||
define <16 x float> @__gather32_float(<16 x i32> %ptrs,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%mask = bitcast <16 x i32> %vecmask to <16 x float>
|
||||
extract_8s(float, mask)
|
||||
extract_8s(i32, ptrs)
|
||||
|
||||
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
|
||||
<8 x i32> %ptrs_1, <8 x float> %mask_1, i8 1)
|
||||
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
|
||||
<8 x i32> %ptrs_2, <8 x float> %mask_2, i8 1)
|
||||
|
||||
assemble_8s(float, v, v1, v2)
|
||||
|
||||
ret <16 x float> %v
|
||||
}
|
||||
|
||||
|
||||
define <16 x float> @__gather64_float(<16 x i64> %ptrs,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%mask = bitcast <16 x i32> %vecmask to <16 x float>
|
||||
extract_4s(i64, ptrs)
|
||||
extract_4s(float, mask)
|
||||
|
||||
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
|
||||
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
|
||||
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
|
||||
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
|
||||
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
|
||||
<4 x i64> %ptrs_3, <4 x float> %mask_3, i8 1)
|
||||
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
|
||||
<4 x i64> %ptrs_4, <4 x float> %mask_4, i8 1)
|
||||
|
||||
assemble_4s(float, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x float> %v
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int64 gathers
|
||||
|
||||
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
|
||||
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
|
||||
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
|
||||
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
|
||||
|
||||
define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
|
||||
i32 %scale, <16 x i32> %offsets,
|
||||
<16 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
|
||||
extract_4s(i32, offsets)
|
||||
extract_4s(i64, vecmask)
|
||||
|
||||
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
|
||||
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
|
||||
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
|
||||
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
|
||||
<4 x i32> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
|
||||
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
|
||||
<4 x i32> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
|
||||
|
||||
assemble_4s(i64, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x i64> %v
|
||||
}
|
||||
|
||||
|
||||
define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
|
||||
i32 %scale, <16 x i64> %offsets,
|
||||
<16 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
|
||||
extract_4s(i64, offsets)
|
||||
extract_4s(i64, vecmask)
|
||||
|
||||
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
|
||||
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
|
||||
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
|
||||
|
||||
assemble_4s(i64, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x i64> %v
|
||||
}
|
||||
|
||||
|
||||
define <16 x i64> @__gather32_i64(<16 x i32> %ptrs,
|
||||
<16 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
|
||||
extract_4s(i32, ptrs)
|
||||
extract_4s(i64, vecmask)
|
||||
|
||||
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
|
||||
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
|
||||
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
|
||||
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
|
||||
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
|
||||
<4 x i32> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
|
||||
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
|
||||
<4 x i32> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
|
||||
|
||||
assemble_4s(i64, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x i64> %v
|
||||
}
|
||||
|
||||
define <16 x i64> @__gather64_i64(<16 x i64> %ptrs,
|
||||
<16 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
|
||||
extract_4s(i64, ptrs)
|
||||
extract_4s(i64, vecmask)
|
||||
|
||||
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
|
||||
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
|
||||
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
|
||||
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
|
||||
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
|
||||
<4 x i64> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
|
||||
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
|
||||
<4 x i64> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
|
||||
|
||||
assemble_4s(i64, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x i64> %v
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double gathers
|
||||
|
||||
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
|
||||
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
|
||||
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
|
||||
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
|
||||
|
||||
define <16 x double> @__gather_base_offsets32_double(i8 * %ptr,
|
||||
i32 %scale, <16 x i32> %offsets,
|
||||
<16 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
|
||||
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
|
||||
extract_4s(i32, offsets)
|
||||
extract_4s(double, vecmask)
|
||||
|
||||
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
|
||||
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
|
||||
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
|
||||
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
|
||||
<4 x i32> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
|
||||
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
|
||||
<4 x i32> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
|
||||
|
||||
assemble_4s(double, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x double> %v
|
||||
}
|
||||
|
||||
|
||||
define <16 x double> @__gather_base_offsets64_double(i8 * %ptr,
|
||||
i32 %scale, <16 x i64> %offsets,
|
||||
<16 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
|
||||
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
|
||||
extract_4s(i64, offsets)
|
||||
extract_4s(double, vecmask)
|
||||
|
||||
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
|
||||
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
|
||||
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
|
||||
|
||||
assemble_4s(double, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x double> %v
|
||||
}
|
||||
|
||||
|
||||
define <16 x double> @__gather32_double(<16 x i32> %ptrs,
|
||||
<16 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
|
||||
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
|
||||
extract_4s(i32, ptrs)
|
||||
extract_4s(double, vecmask)
|
||||
|
||||
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
|
||||
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
|
||||
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
|
||||
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
|
||||
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
|
||||
<4 x i32> %ptrs_3, <4 x double> %vecmask_3, i8 1)
|
||||
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
|
||||
<4 x i32> %ptrs_4, <4 x double> %vecmask_4, i8 1)
|
||||
|
||||
assemble_4s(double, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x double> %v
|
||||
}
|
||||
|
||||
|
||||
define <16 x double> @__gather64_double(<16 x i64> %ptrs,
|
||||
<16 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
|
||||
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
|
||||
extract_4s(i64, ptrs)
|
||||
extract_4s(double, vecmask)
|
||||
|
||||
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
|
||||
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
|
||||
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
|
||||
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
|
||||
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
|
||||
<4 x i64> %ptrs_3, <4 x double> %vecmask_3, i8 1)
|
||||
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
|
||||
<4 x i64> %ptrs_4, <4 x double> %vecmask_4, i8 1)
|
||||
|
||||
assemble_4s(double, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x double> %v
|
||||
}
|
||||
|
||||
')
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -29,8 +29,16 @@
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `',
|
||||
LLVM_VERSION, `LLVM_3_1', `',
|
||||
`define(`HAVE_GATHER', `1')')
|
||||
|
||||
include(`target-avx.ll')
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
|
||||
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
|
||||
`rdrand_definition()')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
@@ -66,6 +74,9 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
;; nothing to define...
|
||||
', `
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
|
||||
; 0 is round nearest even
|
||||
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
|
||||
@@ -100,11 +111,323 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
|
||||
%r = extractelement <8 x i16> %rv, i32 0
|
||||
ret i16 %r
|
||||
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
declare void @llvm.trap() noreturn nounwind
|
||||
|
||||
define(`extract_4s', `
|
||||
%$2_1 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
')
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)',
|
||||
LLVM_VERSION, `LLVM_3_1', `
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)', `
|
||||
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int32 gathers
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
|
||||
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
|
||||
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
|
||||
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
|
||||
|
||||
define <8 x i32> @__gather_base_offsets32_i32(i8 * %ptr,
|
||||
i32 %scale, <8 x i32> %offsets,
|
||||
<8 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
|
||||
%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
|
||||
<8 x i32> %offsets, <8 x i32> %vecmask, i8 %scale8)
|
||||
|
||||
ret <8 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
define <8 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
|
||||
i32 %scale, <8 x i64> %offsets,
|
||||
<8 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
extract_4s(i32, vecmask)
|
||||
extract_4s(i64, offsets)
|
||||
|
||||
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
|
||||
|
||||
%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
define <8 x i32> @__gather32_i32(<8 x i32> %ptrs,
|
||||
<8 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
|
||||
<8 x i32> %ptrs, <8 x i32> %vecmask, i8 1)
|
||||
ret <8 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
define <8 x i32> @__gather64_i32(<8 x i64> %ptrs,
|
||||
<8 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
extract_4s(i64, ptrs)
|
||||
extract_4s(i32, vecmask)
|
||||
|
||||
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
|
||||
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
|
||||
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
|
||||
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
|
||||
|
||||
%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float gathers
|
||||
|
||||
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
|
||||
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
|
||||
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
|
||||
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
|
||||
|
||||
define <8 x float> @__gather_base_offsets32_float(i8 * %ptr,
|
||||
i32 %scale, <8 x i32> %offsets,
|
||||
<8 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
%mask = bitcast <8 x i32> %vecmask to <8 x float>
|
||||
|
||||
%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
|
||||
<8 x i32> %offsets, <8 x float> %mask, i8 %scale8)
|
||||
|
||||
ret <8 x float> %v
|
||||
}
|
||||
|
||||
|
||||
define <8 x float> @__gather_base_offsets64_float(i8 * %ptr,
|
||||
i32 %scale, <8 x i64> %offsets,
|
||||
<8 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
%mask = bitcast <8 x i32> %vecmask to <8 x float>
|
||||
extract_4s(i64, offsets)
|
||||
extract_4s(float, mask)
|
||||
|
||||
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
|
||||
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
|
||||
|
||||
%v = shufflevector <4 x float> %v1, <4 x float> %v2,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x float> %v
|
||||
}
|
||||
|
||||
|
||||
define <8 x float> @__gather32_float(<8 x i32> %ptrs,
|
||||
<8 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%mask = bitcast <8 x i32> %vecmask to <8 x float>
|
||||
|
||||
%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
|
||||
<8 x i32> %ptrs, <8 x float> %mask, i8 1)
|
||||
|
||||
ret <8 x float> %v
|
||||
}
|
||||
|
||||
|
||||
define <8 x float> @__gather64_float(<8 x i64> %ptrs,
|
||||
<8 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%mask = bitcast <8 x i32> %vecmask to <8 x float>
|
||||
extract_4s(i64, ptrs)
|
||||
extract_4s(float, mask)
|
||||
|
||||
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
|
||||
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
|
||||
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
|
||||
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
|
||||
|
||||
%v = shufflevector <4 x float> %v1, <4 x float> %v2,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x float> %v
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int64 gathers
|
||||
|
||||
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
|
||||
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
|
||||
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
|
||||
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
|
||||
|
||||
define <8 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
|
||||
i32 %scale, <8 x i32> %offsets,
|
||||
<8 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
|
||||
extract_4s(i32, offsets)
|
||||
extract_4s(i64, vecmask)
|
||||
|
||||
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
|
||||
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
|
||||
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
|
||||
|
||||
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x i64> %v
|
||||
}
|
||||
|
||||
|
||||
define <8 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
|
||||
i32 %scale, <8 x i64> %offsets,
|
||||
<8 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
|
||||
extract_4s(i64, offsets)
|
||||
extract_4s(i64, vecmask)
|
||||
|
||||
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
|
||||
|
||||
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x i64> %v
|
||||
}
|
||||
|
||||
|
||||
define <8 x i64> @__gather32_i64(<8 x i32> %ptrs,
|
||||
<8 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
|
||||
|
||||
extract_4s(i32, ptrs)
|
||||
extract_4s(i64, vecmask)
|
||||
|
||||
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
|
||||
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
|
||||
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
|
||||
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
|
||||
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x i64> %v
|
||||
}
|
||||
|
||||
|
||||
define <8 x i64> @__gather64_i64(<8 x i64> %ptrs,
|
||||
<8 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
|
||||
extract_4s(i64, ptrs)
|
||||
extract_4s(i64, vecmask)
|
||||
|
||||
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
|
||||
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
|
||||
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
|
||||
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
|
||||
|
||||
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x i64> %v
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double gathers
|
||||
|
||||
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
|
||||
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
|
||||
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
|
||||
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
|
||||
|
||||
define <8 x double> @__gather_base_offsets32_double(i8 * %ptr,
|
||||
i32 %scale, <8 x i32> %offsets,
|
||||
<8 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
|
||||
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
|
||||
extract_4s(i32, offsets)
|
||||
extract_4s(double, vecmask)
|
||||
|
||||
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
|
||||
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
|
||||
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
|
||||
|
||||
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x double> %v
|
||||
}
|
||||
|
||||
define <8 x double> @__gather_base_offsets64_double(i8 * %ptr,
|
||||
i32 %scale, <8 x i64> %offsets,
|
||||
<8 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
|
||||
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
|
||||
extract_4s(i64, offsets)
|
||||
extract_4s(double, vecmask)
|
||||
|
||||
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
|
||||
|
||||
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x double> %v
|
||||
}
|
||||
|
||||
define <8 x double> @__gather32_double(<8 x i32> %ptrs,
|
||||
<8 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
|
||||
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
|
||||
extract_4s(i32, ptrs)
|
||||
extract_4s(double, vecmask)
|
||||
|
||||
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
|
||||
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
|
||||
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
|
||||
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
|
||||
|
||||
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x double> %v
|
||||
}
|
||||
|
||||
define <8 x double> @__gather64_double(<8 x i64> %ptrs,
|
||||
<8 x i32> %mask32) nounwind readonly alwaysinline {
|
||||
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
|
||||
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
|
||||
extract_4s(i64, ptrs)
|
||||
extract_4s(double, vecmask)
|
||||
|
||||
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
|
||||
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
|
||||
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
|
||||
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
|
||||
|
||||
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
|
||||
ret <8 x double> %v
|
||||
}
|
||||
|
||||
')
|
||||
|
||||
100
builtins/target-generic-1.ll
Executable file → Normal file
100
builtins/target-generic-1.ll
Executable file → Normal file
@@ -13,42 +13,44 @@ aossoa()
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(1, i8, 8)
|
||||
gen_masked_store(1, i16, 16)
|
||||
gen_masked_store(1, i32, 32)
|
||||
gen_masked_store(1, i64, 64)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(1, i8, 8)
|
||||
load_and_broadcast(1, i16, 16)
|
||||
load_and_broadcast(1, i32, 32)
|
||||
load_and_broadcast(1, i64, 64)
|
||||
|
||||
masked_load(1, i8, 8, 1)
|
||||
masked_load(1, i16, 16, 2)
|
||||
masked_load(1, i32, 32, 4)
|
||||
masked_load(1, i64, 64, 8)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather(1, i8)
|
||||
gen_gather(1, i16)
|
||||
gen_gather(1, i32)
|
||||
gen_gather(1, i64)
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
gen_scatter(1, i8)
|
||||
gen_scatter(1, i16)
|
||||
gen_scatter(1, i32)
|
||||
gen_scatter(1, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
|
||||
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %mv = trunc <1 x i32> %mask to <1 x i8>
|
||||
; %notmask = xor <1 x i8> %mv, <i8 -1>
|
||||
; %cleared_old = and <1 x i8> %0, %notmask
|
||||
@@ -69,7 +71,7 @@ define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
|
||||
}
|
||||
|
||||
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %mv = trunc <1 x i32> %mask to <1 x i16>
|
||||
; %notmask = xor <1 x i16> %mv, <i16 -1>
|
||||
; %cleared_old = and <1 x i16> %0, %notmask
|
||||
@@ -91,7 +93,7 @@ define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
|
||||
|
||||
|
||||
define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %notmask = xor <1 x i32> %mask, <i32 -1>
|
||||
; %cleared_old = and <1 x i32> %0, %notmask
|
||||
; %masked_new = and <1 x i32> %1, %mask
|
||||
@@ -109,8 +111,9 @@ define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
|
||||
ret <1 x i32> %r
|
||||
|
||||
}
|
||||
|
||||
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %newmask = zext <1 x i32> %mask to <1 x i64>
|
||||
; %notmask = xor <1 x i64> %newmask, <i64 -1>
|
||||
; %cleared_old = and <1 x i64> %0, %notmask
|
||||
@@ -131,7 +134,7 @@ define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
|
||||
}
|
||||
|
||||
define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %v0 = bitcast <1 x float> %0 to <1 x i32>
|
||||
; %v1 = bitcast <1 x float> %1 to <1 x i32>
|
||||
; %r = call <1 x i32> @__vselect_i32(<1 x i32> %v0, <1 x i32> %v1, <1 x i32> %mask)
|
||||
@@ -154,23 +157,23 @@ define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
define void @__masked_store_blend_8(<1 x i8>* nocapture, <1 x i8>,
|
||||
define void @__masked_store_blend_i8(<1 x i8>* nocapture, <1 x i8>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <1 x i8> * %0, align 4
|
||||
%newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask)
|
||||
store <1 x i8> %newval, <1 x i8> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
define void @__masked_store_blend_16(<1 x i16>* nocapture, <1 x i16>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
|
||||
define void @__masked_store_blend_i16(<1 x i16>* nocapture, <1 x i16>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <1 x i16> * %0, align 4
|
||||
%newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask)
|
||||
store <1 x i16> %newval, <1 x i16> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
|
||||
define void @__masked_store_blend_i32(<1 x i32>* nocapture, <1 x i32>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <1 x i32> * %0, align 4
|
||||
%newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask)
|
||||
@@ -178,20 +181,43 @@ define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<1 x i64>* nocapture, <1 x i64>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <1 x i64> * %0, align 4
|
||||
%newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask)
|
||||
store <1 x i64> %newval, <1 x i64> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
|
||||
masked_store_float_double()
|
||||
|
||||
define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
|
||||
%item = extractelement <1 x i32> %0, i32 0
|
||||
%v = lshr i32 %item, 31
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
define i1 @__any(<1 x i32>) nounwind readnone alwaysinline {
|
||||
%item = extractelement <1 x i32> %0, i32 0
|
||||
%v = lshr i32 %item, 31
|
||||
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__all(<1 x i32>) nounwind readnone alwaysinline {
|
||||
%item = extractelement <1 x i32> %0, i32 0
|
||||
%v = lshr i32 %item, 31
|
||||
%cmp = icmp eq i32 %v, 1
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__none(<1 x i32>) nounwind readnone alwaysinline {
|
||||
%item = extractelement <1 x i32> %0, i32 0
|
||||
%v = lshr i32 %item, 31
|
||||
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@@ -476,11 +502,6 @@ define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_uint32(<1 x i32> %v) nounwind readnone {
|
||||
%r = call i32 @__reduce_add_int32(<1 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
|
||||
%r = extractelement <1 x i32> %0, i32 0
|
||||
ret i32 %r
|
||||
@@ -932,4 +953,3 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
|
||||
33
builtins/target-generic-32.ll
Normal file
33
builtins/target-generic-32.ll
Normal file
@@ -0,0 +1,33 @@
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
define(`WIDTH',`32')
|
||||
include(`target-generic-common.ll')
|
||||
33
builtins/target-generic-64.ll
Normal file
33
builtins/target-generic-64.ll
Normal file
@@ -0,0 +1,33 @@
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
define(`WIDTH',`64')
|
||||
include(`target-generic-common.ll')
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -29,12 +29,18 @@
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32";
|
||||
|
||||
define(`MASK',`i1')
|
||||
define(`HAVE_GATHER',`1')
|
||||
define(`HAVE_SCATTER',`1')
|
||||
|
||||
include(`util.m4')
|
||||
|
||||
stdlib_core()
|
||||
scans()
|
||||
reduce_equal(WIDTH)
|
||||
rdrand_decls()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; broadcast/rotate/shuffle
|
||||
@@ -46,6 +52,20 @@ declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
|
||||
declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
|
||||
declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
|
||||
|
||||
declare <WIDTH x float> @__setzero_float() nounwind readnone
|
||||
declare <WIDTH x double> @__setzero_double() nounwind readnone
|
||||
declare <WIDTH x i8> @__setzero_i8() nounwind readnone
|
||||
declare <WIDTH x i16> @__setzero_i16() nounwind readnone
|
||||
declare <WIDTH x i32> @__setzero_i32() nounwind readnone
|
||||
declare <WIDTH x i64> @__setzero_i64() nounwind readnone
|
||||
|
||||
declare <WIDTH x float> @__undef_float() nounwind readnone
|
||||
declare <WIDTH x double> @__undef_double() nounwind readnone
|
||||
declare <WIDTH x i8> @__undef_i8() nounwind readnone
|
||||
declare <WIDTH x i16> @__undef_i16() nounwind readnone
|
||||
declare <WIDTH x i32> @__undef_i32() nounwind readnone
|
||||
declare <WIDTH x i64> @__undef_i64() nounwind readnone
|
||||
|
||||
declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
|
||||
declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
|
||||
declare <WIDTH x i8> @__broadcast_i8(<WIDTH x i8>, i32) nounwind readnone
|
||||
@@ -201,7 +221,10 @@ declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reductions
|
||||
|
||||
declare i32 @__movmsk(<WIDTH x i1>) nounwind readnone
|
||||
declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone
|
||||
declare i1 @__any(<WIDTH x i1>) nounwind readnone
|
||||
declare i1 @__all(<WIDTH x i1>) nounwind readnone
|
||||
declare i1 @__none(<WIDTH x i1>) nounwind readnone
|
||||
|
||||
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
|
||||
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
|
||||
@@ -211,7 +234,6 @@ declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
|
||||
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
|
||||
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
|
||||
|
||||
declare i32 @__reduce_add_uint32(<WIDTH x i32>) nounwind readnone
|
||||
declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
|
||||
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
|
||||
|
||||
@@ -223,34 +245,48 @@ declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
|
||||
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
|
||||
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
|
||||
|
||||
declare i64 @__reduce_add_uint64(<WIDTH x i64>) nounwind readnone
|
||||
declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
|
||||
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(WIDTH, i8, 8)
|
||||
load_and_broadcast(WIDTH, i16, 16)
|
||||
load_and_broadcast(WIDTH, i32, 32)
|
||||
load_and_broadcast(WIDTH, i64, 64)
|
||||
|
||||
declare <WIDTH x i8> @__masked_load_8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x i16> @__masked_load_16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x i32> @__masked_load_32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x i64> @__masked_load_64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x float> @__masked_load_float(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x double> @__masked_load_double(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
|
||||
declare void @__masked_store_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_float(<WIDTH x float>* nocapture, <WIDTH x float>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
declare void @__masked_store_double(<WIDTH x double>* nocapture, <WIDTH x double>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_1svn',`
|
||||
define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
declare void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
declare void @__masked_store_blend_double(<WIDTH x double>* nocapture, <WIDTH x double>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
', `
|
||||
define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x i8> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
|
||||
@@ -258,57 +294,64 @@ define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x i16> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
|
||||
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x i32> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
|
||||
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
|
||||
define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x float> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x float> %1, <WIDTH x float> %v
|
||||
store <WIDTH x float> %v1, <WIDTH x float> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
|
||||
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x i64> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
|
||||
store <WIDTH x i64> %v1, <WIDTH x i64> * %0
|
||||
ret void
|
||||
}
|
||||
',`
|
||||
declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
|
||||
define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
|
||||
<WIDTH x double>, <WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x double> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x double> %1, <WIDTH x double> %v
|
||||
store <WIDTH x double> %v1, <WIDTH x double> * %0
|
||||
ret void
|
||||
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
define(`gather_scatter', `
|
||||
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
|
||||
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
|
||||
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
|
||||
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
|
||||
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, i32, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind readonly
|
||||
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, i32, <WIDTH x i64>,
|
||||
<WIDTH x i1>) nounwind readonly
|
||||
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind readonly
|
||||
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
|
||||
<WIDTH x i1>) nounwind readonly
|
||||
|
||||
declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
|
||||
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
|
||||
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
|
||||
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
|
||||
declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, <WIDTH x i32>,
|
||||
<WIDTH x $1>, <WIDTH x i1>) nounwind
|
||||
declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, <WIDTH x i64>,
|
||||
<WIDTH x $1>, <WIDTH x i1>) nounwind
|
||||
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
|
||||
@@ -318,7 +361,9 @@ declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
|
||||
gather_scatter(i8)
|
||||
gather_scatter(i16)
|
||||
gather_scatter(i32)
|
||||
gather_scatter(float)
|
||||
gather_scatter(i64)
|
||||
gather_scatter(double)
|
||||
|
||||
declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
|
||||
<WIDTH x i1>) nounwind
|
||||
|
||||
@@ -33,6 +33,7 @@ ctlztz()
|
||||
define_prefetches()
|
||||
define_shuffles()
|
||||
aossoa()
|
||||
rdrand_decls()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -295,7 +295,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
@@ -309,7 +309,62 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; and shift the first one over by 4 before ORing it with the value
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; and shift the first one over by 4 before ORing it with the value
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
%cmp = icmp eq i32 %v, 255
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; and shift the first one over by 4 before ORing it with the value
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define <4 x float> @__vec4_add_float(<4 x float> %v0,
|
||||
@@ -360,11 +415,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
@@ -397,7 +447,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone {
|
||||
}
|
||||
|
||||
define <4 x i64> @__add_varying_int64(<4 x i64>,
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
%r = add <4 x i64> %0, %1
|
||||
ret <4 x i64> %r
|
||||
}
|
||||
@@ -432,28 +482,30 @@ reduce_equal(8)
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(8, i8, 8)
|
||||
load_and_broadcast(8, i16, 16)
|
||||
load_and_broadcast(8, i32, 32)
|
||||
load_and_broadcast(8, i64, 64)
|
||||
|
||||
masked_load(8, i8, 8, 1)
|
||||
masked_load(8, i16, 16, 2)
|
||||
masked_load(8, i32, 32, 4)
|
||||
masked_load(8, i64, 64, 8)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
gen_scatter(8, i8)
|
||||
gen_scatter(8, i16)
|
||||
gen_scatter(8, i32)
|
||||
gen_scatter(8, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float rounding
|
||||
@@ -557,23 +609,23 @@ define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alway
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(8, i8, 8)
|
||||
gen_masked_store(8, i16, 16)
|
||||
gen_masked_store(8, i32, 32)
|
||||
gen_masked_store(8, i64, 64)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <8 x i32> * %0, align 4
|
||||
%newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask)
|
||||
store <8 x i32> %newval, <8 x i32> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
%oldValue = load <8 x i64>* %ptr, align 8
|
||||
|
||||
; Do 8x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
|
||||
@@ -616,6 +668,8 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
ret void
|
||||
}
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -239,10 +239,32 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp eq i32 %v, 15
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||
@@ -281,18 +303,13 @@ define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
|
||||
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
define double @__reduce_add_double(<4 x double>) nounwind readnone {
|
||||
@@ -349,16 +366,16 @@ reduce_equal(4)
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <4 x i32> * %0, align 4
|
||||
%newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
|
||||
store <4 x i32> %newval, <4 x i32> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
%oldValue = load <4 x i64>* %ptr, align 8
|
||||
|
||||
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
|
||||
@@ -400,6 +417,8 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
}
|
||||
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
@@ -551,35 +570,37 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
|
||||
|
||||
masked_store_blend_8_16_by_4()
|
||||
|
||||
gen_masked_store(4, i8, 8)
|
||||
gen_masked_store(4, i16, 16)
|
||||
gen_masked_store(4, i32, 32)
|
||||
gen_masked_store(4, i64, 64)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(4, i8, 8)
|
||||
load_and_broadcast(4, i16, 16)
|
||||
load_and_broadcast(4, i32, 32)
|
||||
load_and_broadcast(4, i64, 64)
|
||||
|
||||
masked_load(4, i8, 8, 1)
|
||||
masked_load(4, i16, 16, 2)
|
||||
masked_load(4, i32, 32, 4)
|
||||
masked_load(4, i64, 64, 8)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather(4, i8)
|
||||
gen_gather(4, i16)
|
||||
gen_gather(4, i32)
|
||||
gen_gather(4, i64)
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
gen_scatter(4, i8)
|
||||
gen_scatter(4, i16)
|
||||
gen_scatter(4, i32)
|
||||
gen_scatter(4, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
@@ -33,6 +33,7 @@ ctlztz()
|
||||
define_prefetches()
|
||||
define_shuffles()
|
||||
aossoa()
|
||||
rdrand_decls()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -221,13 +221,13 @@ define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly al
|
||||
; unsigned int min/max
|
||||
|
||||
define <8 x i32> @__min_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
@@ -237,7 +237,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>,
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
@@ -251,7 +251,62 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; and shift the first one over by 4 before ORing it with the value
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; and shift the first one over by 4 before ORing it with the value
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
%cmp = icmp eq i32 %v, 255
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; and shift the first one over by 4 before ORing it with the value
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
@@ -287,11 +342,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
|
||||
}
|
||||
@@ -324,7 +374,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone {
|
||||
}
|
||||
|
||||
define <4 x i64> @__add_varying_int64(<4 x i64>,
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
%r = add <4 x i64> %0, %1
|
||||
ret <4 x i64> %r
|
||||
}
|
||||
@@ -359,28 +409,30 @@ reduce_equal(8)
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(8, i8, 8)
|
||||
load_and_broadcast(8, i16, 16)
|
||||
load_and_broadcast(8, i32, 32)
|
||||
load_and_broadcast(8, i64, 64)
|
||||
|
||||
masked_load(8, i8, 8, 1)
|
||||
masked_load(8, i16, 16, 2)
|
||||
masked_load(8, i32, 32, 4)
|
||||
masked_load(8, i64, 64, 8)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
gen_scatter(8, i8)
|
||||
gen_scatter(8, i16)
|
||||
gen_scatter(8, i32)
|
||||
gen_scatter(8, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float rounding
|
||||
@@ -443,18 +495,18 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(8, i8, 8)
|
||||
gen_masked_store(8, i16, 16)
|
||||
gen_masked_store(8, i32, 32)
|
||||
gen_masked_store(8, i64, 64)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
|
||||
<4 x float>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
; do two 4-wide blends with blendvps
|
||||
%mask_as_float = bitcast <8 x i32> %mask to <8 x float>
|
||||
%mask_a = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
|
||||
@@ -483,8 +535,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
; implement this as 4 blends of <4 x i32>s, which are actually bitcast
|
||||
; <2 x i64>s...
|
||||
|
||||
@@ -550,6 +602,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
ret void
|
||||
}
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -271,10 +271,32 @@ define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alway
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp eq i32 %v, 15
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
@@ -312,18 +334,13 @@ define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
|
||||
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
define double @__reduce_add_double(<4 x double>) nounwind readnone {
|
||||
@@ -383,8 +400,8 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
|
||||
<4 x float>) nounwind readnone
|
||||
|
||||
|
||||
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
|
||||
%oldValue = load <4 x i32>* %0, align 4
|
||||
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
|
||||
@@ -398,8 +415,8 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
|
||||
}
|
||||
|
||||
|
||||
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
<4 x i32> %i32mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
<4 x i32> %i32mask) nounwind alwaysinline {
|
||||
%oldValue = load <4 x i64>* %ptr, align 8
|
||||
%mask = bitcast <4 x i32> %i32mask to <4 x float>
|
||||
|
||||
@@ -450,35 +467,39 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
|
||||
masked_store_blend_8_16_by_4()
|
||||
|
||||
gen_masked_store(4, i8, 8)
|
||||
gen_masked_store(4, i16, 16)
|
||||
gen_masked_store(4, i32, 32)
|
||||
gen_masked_store(4, i64, 64)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(4, i8, 8)
|
||||
load_and_broadcast(4, i16, 16)
|
||||
load_and_broadcast(4, i32, 32)
|
||||
load_and_broadcast(4, i64, 64)
|
||||
|
||||
masked_load(4, i8, 8, 1)
|
||||
masked_load(4, i16, 16, 2)
|
||||
masked_load(4, i32, 32, 4)
|
||||
masked_load(4, i64, 64, 8)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather(4, i8)
|
||||
gen_gather(4, i16)
|
||||
gen_gather(4, i32)
|
||||
gen_gather(4, i64)
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
gen_scatter(4, i8)
|
||||
gen_scatter(4, i16)
|
||||
gen_scatter(4, i32)
|
||||
gen_scatter(4, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
1723
builtins/util.m4
1723
builtins/util.m4
File diff suppressed because it is too large
Load Diff
3036
cbackend.cpp
3036
cbackend.cpp
File diff suppressed because it is too large
Load Diff
@@ -17,7 +17,7 @@ syn keyword ispcStatement cbreak ccontinue creturn launch print reference soa sy
|
||||
syn keyword ispcConditional cif
|
||||
syn keyword ispcRepeat cdo cfor cwhile
|
||||
syn keyword ispcBuiltin programCount programIndex
|
||||
syn keyword ispcType export int8 int16 int32 int64
|
||||
syn keyword ispcType export uniform varying int8 int16 int32 int64
|
||||
|
||||
" Default highlighting
|
||||
command -nargs=+ HiLink hi def link <args>
|
||||
|
||||
8
contrib/ispc.vim.README
Normal file
8
contrib/ispc.vim.README
Normal file
@@ -0,0 +1,8 @@
|
||||
To install vim syntax highlighting for ispc files:
|
||||
|
||||
1) Copy ispc.vim into ~/.vim/syntax/ispc.vim (create if necessary)
|
||||
2) Create a filetype for ispc files to correspond to that syntax file
|
||||
To do this, create and append the following line to ~/.vim/ftdetect/ispc.vim
|
||||
|
||||
au BufRead,BufNewFile *.ispc set filetype=ispc
|
||||
|
||||
117
ctx.h
117
ctx.h
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -40,10 +40,20 @@
|
||||
|
||||
#include "ispc.h"
|
||||
#include <map>
|
||||
#include <llvm/InstrTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Analysis/DIBuilder.h>
|
||||
#include <llvm/Analysis/DebugInfo.h>
|
||||
#if defined(LLVM_3_1) || defined(LLVM_3_2)
|
||||
#include <llvm/InstrTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#else
|
||||
#include <llvm/IR/InstrTypes.h>
|
||||
#include <llvm/IR/Instructions.h>
|
||||
#endif
|
||||
#if defined(LLVM_3_1)
|
||||
#include <llvm/Analysis/DebugInfo.h>
|
||||
#include <llvm/Analysis/DIBuilder.h>
|
||||
#else
|
||||
#include <llvm/DebugInfo.h>
|
||||
#include <llvm/DIBuilder.h>
|
||||
#endif
|
||||
|
||||
struct CFInfo;
|
||||
|
||||
@@ -153,16 +163,17 @@ public:
|
||||
bool uniformControlFlow);
|
||||
|
||||
/** Informs FunctionEmitContext of the value of the mask at the start
|
||||
of a loop body. */
|
||||
void SetLoopMask(llvm::Value *mask);
|
||||
of a loop body or switch statement. */
|
||||
void SetBlockEntryMask(llvm::Value *mask);
|
||||
|
||||
/** Informs FunctionEmitContext that code generation for a loop is
|
||||
finished. */
|
||||
void EndLoop();
|
||||
|
||||
/** Indicates that code generation for a 'foreach' or 'foreach_tiled'
|
||||
loop is about to start. */
|
||||
void StartForeach();
|
||||
/** Indicates that code generation for a 'foreach', 'foreach_tiled',
|
||||
'foreach_active', or 'foreach_unique' loop is about to start. */
|
||||
enum ForeachType { FOREACH_REGULAR, FOREACH_ACTIVE, FOREACH_UNIQUE };
|
||||
void StartForeach(ForeachType ft);
|
||||
void EndForeach();
|
||||
|
||||
/** Emit code for a 'break' statement in a loop. If doCoherenceCheck
|
||||
@@ -230,6 +241,13 @@ public:
|
||||
|
||||
bool InForeachLoop() const;
|
||||
|
||||
/** Temporarily disables emission of performance warnings from gathers
|
||||
and scatters from subsequent code. */
|
||||
void DisableGatherScatterWarnings();
|
||||
|
||||
/** Reenables emission of gather/scatter performance warnings. */
|
||||
void EnableGatherScatterWarnings();
|
||||
|
||||
void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }
|
||||
|
||||
/** Step through the code and find label statements; create a basic
|
||||
@@ -241,6 +259,10 @@ public:
|
||||
new basic block that it starts. */
|
||||
llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
|
||||
|
||||
/** Returns a vector of all labels in the context. This is
|
||||
simply the key set of the labelMap */
|
||||
std::vector<std::string> GetLabels();
|
||||
|
||||
/** Called to generate code for 'return' statement; value is the
|
||||
expression in the return statement (if non-NULL), and
|
||||
doCoherenceCheck indicates whether instructions should be generated
|
||||
@@ -265,7 +287,7 @@ public:
|
||||
llvm::Value *None(llvm::Value *mask);
|
||||
|
||||
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
|
||||
i32 value wherein the i'th bit is on if and only if the i'th lane
|
||||
i64 value wherein the i'th bit is on if and only if the i'th lane
|
||||
of the mask is on. */
|
||||
llvm::Value *LaneMask(llvm::Value *mask);
|
||||
|
||||
@@ -331,7 +353,7 @@ public:
|
||||
|
||||
/** Emits debugging information for the function parameter represented
|
||||
by sym. */
|
||||
void EmitFunctionParameterDebugInfo(Symbol *sym);
|
||||
void EmitFunctionParameterDebugInfo(Symbol *sym, int parameterNum);
|
||||
/** @} */
|
||||
|
||||
/** @name IR instruction emission
|
||||
@@ -373,25 +395,35 @@ public:
|
||||
array, for pointer types). */
|
||||
llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
|
||||
|
||||
llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Value *BitCastInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Value *IntToPtrInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
|
||||
llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Instruction *TruncInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
|
||||
LLVM_TYPE_CONST llvm::Type *type, const char *name = NULL);
|
||||
llvm::Instruction *FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Type *type, const char *name = NULL);
|
||||
llvm::Instruction *FPCastInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Instruction *SExtInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Instruction *ZExtInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
|
||||
/** Given two integer-typed values (but possibly one vector and the
|
||||
other not, and or of possibly-different bit-widths), update their
|
||||
values as needed so that the two have the same (more general)
|
||||
type. */
|
||||
void MatchIntegerTypes(llvm::Value **v0, llvm::Value **v1);
|
||||
|
||||
/** Create a new slice pointer out of the given pointer to an soa type
|
||||
and an integer offset to a slice within that type. */
|
||||
llvm::Value *MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset);
|
||||
|
||||
/** These GEP methods are generalizations of the standard ones in LLVM;
|
||||
they support both uniform and varying basePtr values as well as
|
||||
uniform and varying index values (arrays of indices). Varying base
|
||||
@@ -412,7 +444,8 @@ public:
|
||||
the type of the pointer, though it may be NULL if the base pointer
|
||||
is uniform. */
|
||||
llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
|
||||
const Type *ptrType, const char *name = NULL);
|
||||
const Type *ptrType, const char *name = NULL,
|
||||
const PointerType **resultPtrType = NULL);
|
||||
|
||||
/** Load from the memory location(s) given by lvalue, using the given
|
||||
mask. The lvalue may be varying, in which case this corresponds to
|
||||
@@ -430,7 +463,7 @@ public:
|
||||
instruction is added at the start of the function in the entry
|
||||
basic block; if it should be added to the current basic block, then
|
||||
the atEntryBlock parameter should be false. */
|
||||
llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType,
|
||||
llvm::Value *AllocaInst(llvm::Type *llvmType,
|
||||
const char *name = NULL, int align = 0,
|
||||
bool atEntryBlock = true);
|
||||
|
||||
@@ -443,7 +476,14 @@ public:
|
||||
varying, the given storeMask is used to mask the stores so that
|
||||
they only execute for the active program instances. */
|
||||
void StoreInst(llvm::Value *value, llvm::Value *ptr,
|
||||
llvm::Value *storeMask, const Type *ptrType);
|
||||
llvm::Value *storeMask, const Type *valueType,
|
||||
const Type *ptrType);
|
||||
|
||||
/** Copy count bytes of memory from the location pointed to by src to
|
||||
the location pointed to by dest. (src and dest must not be
|
||||
overlapping.) */
|
||||
void MemcpyInst(llvm::Value *dest, llvm::Value *src, llvm::Value *count,
|
||||
llvm::Value *align = NULL);
|
||||
|
||||
void BranchInst(llvm::BasicBlock *block);
|
||||
void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
|
||||
@@ -460,7 +500,7 @@ public:
|
||||
llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
|
||||
const char *name = NULL);
|
||||
|
||||
llvm::PHINode *PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
|
||||
llvm::PHINode *PhiNode(llvm::Type *type, int count,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
|
||||
llvm::Value *val1, const char *name = NULL);
|
||||
@@ -531,9 +571,9 @@ private:
|
||||
for error messages and debugging symbols. */
|
||||
SourcePos funcStartPos;
|
||||
|
||||
/** If currently in a loop body, the value of the mask at the start of
|
||||
the loop. */
|
||||
llvm::Value *loopMask;
|
||||
/** If currently in a loop body or switch statement, the value of the
|
||||
mask at the start of it. */
|
||||
llvm::Value *blockEntryMask;
|
||||
|
||||
/** If currently in a loop body or switch statement, this is a pointer
|
||||
to memory to store a mask value that represents which of the lanes
|
||||
@@ -607,12 +647,12 @@ private:
|
||||
std::vector<CFInfo *> controlFlowInfo;
|
||||
|
||||
/** DIFile object corresponding to the source file where the current
|
||||
function was defined (used for debugging info0. */
|
||||
function was defined (used for debugging info). */
|
||||
llvm::DIFile diFile;
|
||||
|
||||
/** DISubprogram corresponding to this function (used for debugging
|
||||
info). */
|
||||
llvm::DISubprogram diFunction;
|
||||
llvm::DISubprogram diSubprogram;
|
||||
|
||||
/** These correspond to the current set of nested scopes in the
|
||||
function. */
|
||||
@@ -626,6 +666,10 @@ private:
|
||||
tasks launched from the current function. */
|
||||
llvm::Value *launchGroupHandlePtr;
|
||||
|
||||
/** Nesting count of the number of times calling code has disabled (and
|
||||
not yet reenabled) gather/scatter performance warnings. */
|
||||
int disableGSWarningCount;
|
||||
|
||||
std::map<std::string, llvm::BasicBlock *> labelMap;
|
||||
|
||||
static bool initLabelBBlocks(ASTNode *node, void *data);
|
||||
@@ -646,12 +690,19 @@ private:
|
||||
|
||||
CFInfo *popCFState();
|
||||
|
||||
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
|
||||
llvm::Value *mask);
|
||||
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *valueType,
|
||||
const Type *ptrType, llvm::Value *mask);
|
||||
void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
|
||||
llvm::Value *mask);
|
||||
llvm::Value *gather(llvm::Value *ptr, const Type *ptrType, llvm::Value *mask,
|
||||
const char *name);
|
||||
void storeUniformToSOA(llvm::Value *value, llvm::Value *ptr,
|
||||
llvm::Value *mask, const Type *valueType,
|
||||
const PointerType *ptrType);
|
||||
llvm::Value *loadUniformFromSOA(llvm::Value *ptr, llvm::Value *mask,
|
||||
const PointerType *ptrType, const char *name);
|
||||
|
||||
llvm::Value *gather(llvm::Value *ptr, const PointerType *ptrType,
|
||||
llvm::Value *mask, const char *name);
|
||||
|
||||
llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
|
||||
};
|
||||
|
||||
|
||||
590
decl.cpp
590
decl.cpp
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -33,7 +33,7 @@
|
||||
|
||||
/** @file decl.cpp
|
||||
@brief Implementations of classes related to turning declarations into
|
||||
symbols and types.
|
||||
symbol names and types.
|
||||
*/
|
||||
|
||||
#include "decl.h"
|
||||
@@ -44,6 +44,7 @@
|
||||
#include "stmt.h"
|
||||
#include "expr.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <set>
|
||||
|
||||
static void
|
||||
@@ -55,6 +56,8 @@ lPrintTypeQualifiers(int typeQualifiers) {
|
||||
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
|
||||
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
|
||||
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
|
||||
if (typeQualifiers & TYPEQUAL_EXPORT) printf("export ");
|
||||
if (typeQualifiers & TYPEQUAL_UNMASKED) printf("unmasked ");
|
||||
}
|
||||
|
||||
|
||||
@@ -69,12 +72,21 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
|
||||
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
|
||||
type = type->GetAsConstType();
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
|
||||
type = type->GetAsUniformType();
|
||||
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
|
||||
type = type->GetAsVaryingType();
|
||||
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) {
|
||||
if (Type::Equal(type, AtomicType::Void))
|
||||
Error(pos, "\"uniform\" qualifier is illegal with \"void\" type.");
|
||||
else
|
||||
type = type->GetAsUniformType();
|
||||
}
|
||||
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0) {
|
||||
if (Type::Equal(type, AtomicType::Void))
|
||||
Error(pos, "\"varying\" qualifier is illegal with \"void\" type.");
|
||||
else
|
||||
type = type->GetAsVaryingType();
|
||||
}
|
||||
else
|
||||
type = type->GetAsUnboundVariabilityType();
|
||||
if (Type::Equal(type, AtomicType::Void) == false)
|
||||
type = type->GetAsUnboundVariabilityType();
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
|
||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
|
||||
@@ -84,15 +96,20 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
|
||||
const Type *unsignedType = type->GetAsUnsignedType();
|
||||
if (unsignedType != NULL)
|
||||
type = unsignedType;
|
||||
else
|
||||
else {
|
||||
const Type *resolvedType =
|
||||
type->ResolveUnboundVariability(Variability::Varying);
|
||||
Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
|
||||
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
|
||||
resolvedType->GetString().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false)
|
||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false) {
|
||||
const Type *resolvedType =
|
||||
type->ResolveUnboundVariability(Variability::Varying);
|
||||
Error(pos, "\"signed\" qualifier is illegal with non-integer type "
|
||||
"\"%s\".",
|
||||
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
|
||||
"\"%s\".", resolvedType->GetString().c_str());
|
||||
}
|
||||
|
||||
return type;
|
||||
}
|
||||
@@ -112,18 +129,59 @@ DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
|
||||
|
||||
const Type *
|
||||
DeclSpecs::GetBaseType(SourcePos pos) const {
|
||||
const Type *bt = baseType;
|
||||
const Type *retType = baseType;
|
||||
|
||||
if (retType == NULL) {
|
||||
Warning(pos, "No type specified in declaration. Assuming int32.");
|
||||
retType = AtomicType::UniformInt32->GetAsUnboundVariabilityType();
|
||||
}
|
||||
|
||||
if (vectorSize > 0) {
|
||||
const AtomicType *atomicType = dynamic_cast<const AtomicType *>(bt);
|
||||
const AtomicType *atomicType = CastType<AtomicType>(retType);
|
||||
if (atomicType == NULL) {
|
||||
Error(pos, "Only atomic types (int, float, ...) are legal for vector "
|
||||
"types.");
|
||||
return NULL;
|
||||
}
|
||||
bt = new VectorType(atomicType, vectorSize);
|
||||
retType = new VectorType(atomicType, vectorSize);
|
||||
}
|
||||
|
||||
return lApplyTypeQualifiers(typeQualifiers, bt, pos);
|
||||
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
|
||||
|
||||
if (soaWidth > 0) {
|
||||
const StructType *st = CastType<StructType>(retType);
|
||||
|
||||
if (st == NULL) {
|
||||
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
|
||||
"type \"%s\".", soaWidth, retType->GetString().c_str());
|
||||
return NULL;
|
||||
}
|
||||
else if (soaWidth <= 0 || (soaWidth & (soaWidth - 1)) != 0) {
|
||||
Error(pos, "soa<%d> width illegal. Value must be positive power "
|
||||
"of two.", soaWidth);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (st->IsUniformType()) {
|
||||
Error(pos, "\"uniform\" qualifier and \"soa<%d>\" qualifier can't "
|
||||
"both be used in a type declaration.", soaWidth);
|
||||
return NULL;
|
||||
}
|
||||
else if (st->IsVaryingType()) {
|
||||
Error(pos, "\"varying\" qualifier and \"soa<%d>\" qualifier can't "
|
||||
"both be used in a type declaration.", soaWidth);
|
||||
return NULL;
|
||||
}
|
||||
else
|
||||
retType = st->GetAsSOAType(soaWidth);
|
||||
|
||||
if (soaWidth < g->target.vectorWidth)
|
||||
PerformanceWarning(pos, "soa<%d> width smaller than gang size %d "
|
||||
"currently leads to inefficient code to access "
|
||||
"soa types.", soaWidth, g->target.vectorWidth);
|
||||
}
|
||||
|
||||
return retType;
|
||||
}
|
||||
|
||||
|
||||
@@ -133,7 +191,6 @@ lGetStorageClassName(StorageClass storageClass) {
|
||||
case SC_NONE: return "";
|
||||
case SC_EXTERN: return "extern";
|
||||
case SC_EXTERN_C: return "extern \"C\"";
|
||||
case SC_EXPORT: return "export";
|
||||
case SC_STATIC: return "static";
|
||||
case SC_TYPEDEF: return "typedef";
|
||||
default: FATAL("Unhandled storage class in lGetStorageClassName");
|
||||
@@ -162,31 +219,30 @@ Declarator::Declarator(DeclaratorKind dk, SourcePos p)
|
||||
: pos(p), kind(dk) {
|
||||
child = NULL;
|
||||
typeQualifiers = 0;
|
||||
storageClass = SC_NONE;
|
||||
arraySize = -1;
|
||||
sym = NULL;
|
||||
type = NULL;
|
||||
initExpr = NULL;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
|
||||
const Type *t = GetType(ds);
|
||||
Symbol *sym = GetSymbol();
|
||||
if (sym != NULL) {
|
||||
sym->type = t;
|
||||
sym->storageClass = ds->storageClass;
|
||||
const Type *baseType = ds->GetBaseType(pos);
|
||||
InitFromType(baseType, ds);
|
||||
|
||||
if (type == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
storageClass = ds->storageClass;
|
||||
|
||||
Symbol *
|
||||
Declarator::GetSymbol() const {
|
||||
// The symbol lives at the last child in the chain, so walk down there
|
||||
// and return the one there.
|
||||
const Declarator *d = this;
|
||||
while (d->child != NULL)
|
||||
d = d->child;
|
||||
return d->sym;
|
||||
if (ds->declSpecList.size() > 0 &&
|
||||
CastType<FunctionType>(type) == NULL) {
|
||||
Error(pos, "__declspec specifiers for non-function type \"%s\" are "
|
||||
"not used.", type->GetString().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -196,11 +252,11 @@ Declarator::Print(int indent) const {
|
||||
pos.Print();
|
||||
|
||||
lPrintTypeQualifiers(typeQualifiers);
|
||||
Symbol *sym = GetSymbol();
|
||||
if (sym != NULL)
|
||||
printf("%s", sym->name.c_str());
|
||||
printf("%s ", lGetStorageClassName(storageClass));
|
||||
if (name.size() > 0)
|
||||
printf("%s", name.c_str());
|
||||
else
|
||||
printf("(null symbol)");
|
||||
printf("(unnamed)");
|
||||
|
||||
printf(", array size = %d", arraySize);
|
||||
|
||||
@@ -234,132 +290,157 @@ Declarator::Print(int indent) const {
|
||||
}
|
||||
|
||||
|
||||
Symbol *
|
||||
Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
|
||||
const FunctionType *type =
|
||||
dynamic_cast<const FunctionType *>(GetType(ds));
|
||||
if (type == NULL)
|
||||
return NULL;
|
||||
|
||||
Symbol *declSym = GetSymbol();
|
||||
Assert(declSym != NULL);
|
||||
|
||||
// Get the symbol for the function from the symbol table. (It should
|
||||
// already have been added to the symbol table by AddGlobal() by the
|
||||
// time we get here.)
|
||||
Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
|
||||
if (funSym != NULL)
|
||||
// May be NULL due to error earlier in compilation
|
||||
funSym->pos = pos;
|
||||
|
||||
// Walk down to the declarator for the function. (We have to get past
|
||||
// the stuff that specifies the function's return type before we get to
|
||||
// the function's declarator.)
|
||||
Declarator *d = this;
|
||||
while (d != NULL && d->kind != DK_FUNCTION)
|
||||
d = d->child;
|
||||
Assert(d != NULL);
|
||||
|
||||
for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
|
||||
Symbol *sym = d->GetSymbolForFunctionParameter(i);
|
||||
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
|
||||
funArgs->push_back(sym);
|
||||
}
|
||||
|
||||
if (funSym != NULL)
|
||||
funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);
|
||||
|
||||
return funSym;
|
||||
}
|
||||
|
||||
|
||||
const Type *
|
||||
Declarator::GetType(const Type *base, DeclSpecs *ds) const {
|
||||
void
|
||||
Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
|
||||
bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
|
||||
bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
|
||||
bool isTask = ((typeQualifiers & TYPEQUAL_TASK) != 0);
|
||||
bool isExported = ((typeQualifiers & TYPEQUAL_EXPORT) != 0);
|
||||
bool isConst = ((typeQualifiers & TYPEQUAL_CONST) != 0);
|
||||
bool isUnmasked = ((typeQualifiers & TYPEQUAL_UNMASKED) != 0);
|
||||
|
||||
if (hasUniformQual && hasVaryingQual) {
|
||||
Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
if (kind != DK_FUNCTION && isTask)
|
||||
if (kind != DK_FUNCTION && isTask) {
|
||||
Error(pos, "\"task\" qualifier illegal in variable declaration.");
|
||||
|
||||
Type::Variability variability = Type::Unbound;
|
||||
return;
|
||||
}
|
||||
if (kind != DK_FUNCTION && isUnmasked) {
|
||||
Error(pos, "\"unmasked\" qualifier illegal in variable declaration.");
|
||||
return;
|
||||
}
|
||||
if (kind != DK_FUNCTION && isExported) {
|
||||
Error(pos, "\"export\" qualifier illegal in variable declaration.");
|
||||
return;
|
||||
}
|
||||
|
||||
Variability variability(Variability::Unbound);
|
||||
if (hasUniformQual)
|
||||
variability = Type::Uniform;
|
||||
variability = Variability::Uniform;
|
||||
else if (hasVaryingQual)
|
||||
variability = Type::Varying;
|
||||
variability = Variability::Varying;
|
||||
|
||||
const Type *type = base;
|
||||
switch (kind) {
|
||||
case DK_BASE:
|
||||
if (kind == DK_BASE) {
|
||||
// All of the type qualifiers should be in the DeclSpecs for the
|
||||
// base declarator
|
||||
Assert(typeQualifiers == 0);
|
||||
Assert(child == NULL);
|
||||
return type;
|
||||
|
||||
case DK_POINTER:
|
||||
type = new PointerType(type, variability, isConst);
|
||||
if (child != NULL)
|
||||
return child->GetType(type, ds);
|
||||
AssertPos(pos, typeQualifiers == 0);
|
||||
AssertPos(pos, child == NULL);
|
||||
type = baseType;
|
||||
}
|
||||
else if (kind == DK_POINTER) {
|
||||
/* For now, any pointer to an SOA type gets the slice property; if
|
||||
we add the capability to declare pointers as slices or not,
|
||||
we'll want to set this based on a type qualifier here. */
|
||||
const Type *ptrType = new PointerType(baseType, variability, isConst,
|
||||
baseType->IsSOAType());
|
||||
if (child != NULL) {
|
||||
child->InitFromType(ptrType, ds);
|
||||
type = child->type;
|
||||
name = child->name;
|
||||
}
|
||||
else
|
||||
return type;
|
||||
break;
|
||||
|
||||
case DK_REFERENCE:
|
||||
if (hasUniformQual)
|
||||
type = ptrType;
|
||||
}
|
||||
else if (kind == DK_REFERENCE) {
|
||||
if (hasUniformQual) {
|
||||
Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
|
||||
if (hasVaryingQual)
|
||||
return;
|
||||
}
|
||||
if (hasVaryingQual) {
|
||||
Error(pos, "\"varying\" qualifier is illegal to apply to references.");
|
||||
if (isConst)
|
||||
return;
|
||||
}
|
||||
if (isConst) {
|
||||
Error(pos, "\"const\" qualifier is to illegal apply to references.");
|
||||
|
||||
return;
|
||||
}
|
||||
// The parser should disallow this already, but double check.
|
||||
if (dynamic_cast<const ReferenceType *>(type) != NULL) {
|
||||
if (CastType<ReferenceType>(baseType) != NULL) {
|
||||
Error(pos, "References to references are illegal.");
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
type = new ReferenceType(type);
|
||||
if (child != NULL)
|
||||
return child->GetType(type, ds);
|
||||
const Type *refType = new ReferenceType(baseType);
|
||||
if (child != NULL) {
|
||||
child->InitFromType(refType, ds);
|
||||
type = child->type;
|
||||
name = child->name;
|
||||
}
|
||||
else
|
||||
return type;
|
||||
break;
|
||||
type = refType;
|
||||
}
|
||||
else if (kind == DK_ARRAY) {
|
||||
if (Type::Equal(baseType, AtomicType::Void)) {
|
||||
Error(pos, "Arrays of \"void\" type are illegal.");
|
||||
return;
|
||||
}
|
||||
if (CastType<ReferenceType>(baseType)) {
|
||||
Error(pos, "Arrays of references (type \"%s\") are illegal.",
|
||||
baseType->GetString().c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
case DK_ARRAY:
|
||||
type = new ArrayType(type, arraySize);
|
||||
if (child)
|
||||
return child->GetType(type, ds);
|
||||
const Type *arrayType = new ArrayType(baseType, arraySize);
|
||||
if (child != NULL) {
|
||||
child->InitFromType(arrayType, ds);
|
||||
type = child->type;
|
||||
name = child->name;
|
||||
}
|
||||
else
|
||||
return type;
|
||||
break;
|
||||
|
||||
case DK_FUNCTION: {
|
||||
std::vector<const Type *> args;
|
||||
std::vector<std::string> argNames;
|
||||
std::vector<ConstExpr *> argDefaults;
|
||||
std::vector<SourcePos> argPos;
|
||||
|
||||
type = arrayType;
|
||||
}
|
||||
else if (kind == DK_FUNCTION) {
|
||||
llvm::SmallVector<const Type *, 8> args;
|
||||
llvm::SmallVector<std::string, 8> argNames;
|
||||
llvm::SmallVector<Expr *, 8> argDefaults;
|
||||
llvm::SmallVector<SourcePos, 8> argPos;
|
||||
|
||||
// Loop over the function arguments and store the names, types,
|
||||
// default values (if any), and source file positions each one in
|
||||
// the corresponding vector.
|
||||
for (unsigned int i = 0; i < functionParams.size(); ++i) {
|
||||
Declaration *d = functionParams[i];
|
||||
|
||||
Symbol *sym = GetSymbolForFunctionParameter(i);
|
||||
if (d == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
continue;
|
||||
}
|
||||
if (d->declarators.size() == 0) {
|
||||
// function declaration like foo(float), w/o a name for the
|
||||
// parameter; wire up a placeholder Declarator for it
|
||||
d->declarators.push_back(new Declarator(DK_BASE, pos));
|
||||
d->declarators[0]->InitFromDeclSpecs(d->declSpecs);
|
||||
}
|
||||
|
||||
AssertPos(pos, d->declarators.size() == 1);
|
||||
Declarator *decl = d->declarators[0];
|
||||
if (decl == NULL || decl->type == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (decl->name == "") {
|
||||
// Give a name to any anonymous parameter declarations
|
||||
char buf[32];
|
||||
sprintf(buf, "__anon_parameter_%d", i);
|
||||
decl->name = buf;
|
||||
}
|
||||
decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
|
||||
|
||||
if (d->declSpecs->storageClass != SC_NONE)
|
||||
Error(sym->pos, "Storage class \"%s\" is illegal in "
|
||||
Error(decl->pos, "Storage class \"%s\" is illegal in "
|
||||
"function parameter declaration for parameter \"%s\".",
|
||||
lGetStorageClassName(d->declSpecs->storageClass),
|
||||
sym->name.c_str());
|
||||
decl->name.c_str());
|
||||
if (Type::Equal(decl->type, AtomicType::Void)) {
|
||||
Error(decl->pos, "Parameter with type \"void\" illegal in function "
|
||||
"parameter list.");
|
||||
decl->type = NULL;
|
||||
}
|
||||
|
||||
const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
|
||||
const ArrayType *at = CastType<ArrayType>(decl->type);
|
||||
if (at != NULL) {
|
||||
// As in C, arrays are passed to functions as pointers to
|
||||
// their element type. We'll just immediately make this
|
||||
@@ -369,144 +450,124 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
|
||||
// report this differently than it was originally declared
|
||||
// in the function, but it's not clear that this is a
|
||||
// significant problem.)
|
||||
sym->type = PointerType::GetUniform(at->GetElementType());
|
||||
const Type *targetType = at->GetElementType();
|
||||
if (targetType == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
return;
|
||||
}
|
||||
|
||||
decl->type = PointerType::GetUniform(targetType, at->IsSOAType());
|
||||
|
||||
// Make sure there are no unsized arrays (other than the
|
||||
// first dimension) in function parameter lists.
|
||||
at = dynamic_cast<const ArrayType *>(at->GetElementType());
|
||||
at = CastType<ArrayType>(targetType);
|
||||
while (at != NULL) {
|
||||
if (at->GetElementCount() == 0)
|
||||
Error(sym->pos, "Arrays with unsized dimensions in "
|
||||
Error(decl->pos, "Arrays with unsized dimensions in "
|
||||
"dimensions after the first one are illegal in "
|
||||
"function parameter lists.");
|
||||
at = dynamic_cast<const ArrayType *>(at->GetElementType());
|
||||
at = CastType<ArrayType>(at->GetElementType());
|
||||
}
|
||||
}
|
||||
|
||||
args.push_back(sym->type);
|
||||
argNames.push_back(sym->name);
|
||||
argPos.push_back(sym->pos);
|
||||
args.push_back(decl->type);
|
||||
argNames.push_back(decl->name);
|
||||
argPos.push_back(decl->pos);
|
||||
|
||||
ConstExpr *init = NULL;
|
||||
if (d->declarators.size()) {
|
||||
// Try to find an initializer expression; if there is one,
|
||||
// it lives down to the base declarator.
|
||||
Declarator *decl = d->declarators[0];
|
||||
while (decl->child != NULL) {
|
||||
Assert(decl->initExpr == NULL);
|
||||
Expr *init = NULL;
|
||||
// Try to find an initializer expression.
|
||||
while (decl != NULL) {
|
||||
if (decl->initExpr != NULL) {
|
||||
decl->initExpr = TypeCheck(decl->initExpr);
|
||||
decl->initExpr = Optimize(decl->initExpr);
|
||||
if (decl->initExpr != NULL) {
|
||||
init = dynamic_cast<ConstExpr *>(decl->initExpr);
|
||||
if (init == NULL)
|
||||
init = dynamic_cast<NullPointerExpr *>(decl->initExpr);
|
||||
if (init == NULL)
|
||||
Error(decl->initExpr->pos, "Default value for parameter "
|
||||
"\"%s\" must be a compile-time constant.",
|
||||
decl->name.c_str());
|
||||
}
|
||||
break;
|
||||
}
|
||||
else
|
||||
decl = decl->child;
|
||||
}
|
||||
|
||||
if (decl->initExpr != NULL &&
|
||||
(decl->initExpr = TypeCheck(decl->initExpr)) != NULL &&
|
||||
(decl->initExpr = Optimize(decl->initExpr)) != NULL &&
|
||||
(init = dynamic_cast<ConstExpr *>(decl->initExpr)) == NULL) {
|
||||
Error(decl->initExpr->pos, "Default value for parameter "
|
||||
"\"%s\" must be a compile-time constant.",
|
||||
sym->name.c_str());
|
||||
}
|
||||
}
|
||||
argDefaults.push_back(init);
|
||||
}
|
||||
|
||||
const Type *returnType = type;
|
||||
const Type *returnType = baseType;
|
||||
if (returnType == NULL) {
|
||||
Error(pos, "No return type provided in function declaration.");
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
if (CastType<FunctionType>(returnType) != NULL) {
|
||||
Error(pos, "Illegal to return function type from function.");
|
||||
return;
|
||||
}
|
||||
|
||||
bool isExported = ds && (ds->storageClass == SC_EXPORT);
|
||||
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
|
||||
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
|
||||
returnType = returnType->ResolveUnboundVariability(Variability::Varying);
|
||||
|
||||
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
|
||||
bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
|
||||
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
|
||||
bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);
|
||||
|
||||
if (isExported && isTask) {
|
||||
Error(pos, "Function can't have both \"task\" and \"export\" "
|
||||
"qualifiers");
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
if (isExternC && isTask) {
|
||||
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
|
||||
"qualifiers");
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
if (isExternC && isExported) {
|
||||
Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
|
||||
"qualifiers");
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
if (isUnmasked && isExported)
|
||||
Warning(pos, "\"unmasked\" qualifier is redundant for exported "
|
||||
"functions.");
|
||||
|
||||
if (child == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
return;
|
||||
}
|
||||
|
||||
const Type *functionType =
|
||||
const FunctionType *functionType =
|
||||
new FunctionType(returnType, args, argNames, argDefaults,
|
||||
argPos, isTask, isExported, isExternC);
|
||||
functionType = functionType->ResolveUnboundVariability(Type::Varying);
|
||||
return child->GetType(functionType, ds);
|
||||
}
|
||||
default:
|
||||
FATAL("Unexpected decl kind");
|
||||
return NULL;
|
||||
}
|
||||
argPos, isTask, isExported, isExternC, isUnmasked);
|
||||
|
||||
#if 0
|
||||
// Make sure we actually have an array of structs ..
|
||||
const StructType *childStructType =
|
||||
dynamic_cast<const StructType *>(childType);
|
||||
if (childStructType == NULL) {
|
||||
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
|
||||
"type \"%s\".", soaWidth, childType->GetString().c_str());
|
||||
return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
|
||||
// handle any explicit __declspecs on the function
|
||||
if (ds != NULL) {
|
||||
for (int i = 0; i < (int)ds->declSpecList.size(); ++i) {
|
||||
std::string str = ds->declSpecList[i].first;
|
||||
SourcePos pos = ds->declSpecList[i].second;
|
||||
|
||||
if (str == "safe")
|
||||
(const_cast<FunctionType *>(functionType))->isSafe = true;
|
||||
else if (!strncmp(str.c_str(), "cost", 4)) {
|
||||
int cost = atoi(str.c_str() + 4);
|
||||
if (cost < 0)
|
||||
Error(pos, "Negative function cost %d is illegal.",
|
||||
cost);
|
||||
(const_cast<FunctionType *>(functionType))->costOverride = cost;
|
||||
}
|
||||
else
|
||||
Error(pos, "__declspec parameter \"%s\" unknown.", str.c_str());
|
||||
}
|
||||
else if ((soaWidth & (soaWidth - 1)) != 0) {
|
||||
Error(pos, "soa<%d> width illegal. Value must be power of two.",
|
||||
soaWidth);
|
||||
return NULL;
|
||||
}
|
||||
else if (arraySize != -1 && (arraySize % soaWidth) != 0) {
|
||||
Error(pos, "soa<%d> width must evenly divide array size %d.",
|
||||
soaWidth, arraySize);
|
||||
return NULL;
|
||||
}
|
||||
return new SOAArrayType(childStructType, arraySize == -1 ? 0 : arraySize,
|
||||
soaWidth);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
const Type *
|
||||
Declarator::GetType(DeclSpecs *ds) const {
|
||||
const Type *baseType = ds->GetBaseType(pos);
|
||||
const Type *type = GetType(baseType, ds);
|
||||
return type;
|
||||
}
|
||||
|
||||
|
||||
Symbol *
|
||||
Declarator::GetSymbolForFunctionParameter(int paramNum) const {
|
||||
Assert(paramNum < (int)functionParams.size());
|
||||
Declaration *d = functionParams[paramNum];
|
||||
|
||||
char buf[32];
|
||||
Symbol *sym;
|
||||
if (d->declarators.size() == 0) {
|
||||
// function declaration like foo(float), w/o a name for
|
||||
// the parameter
|
||||
sprintf(buf, "__anon_parameter_%d", paramNum);
|
||||
sym = new Symbol(buf, pos);
|
||||
sym->type = d->declSpecs->GetBaseType(pos);
|
||||
}
|
||||
else {
|
||||
Assert(d->declarators.size() == 1);
|
||||
sym = d->declarators[0]->GetSymbol();
|
||||
if (sym == NULL) {
|
||||
// Handle more complex anonymous declarations like
|
||||
// float (float **).
|
||||
sprintf(buf, "__anon_parameter_%d", paramNum);
|
||||
sym = new Symbol(buf, d->declarators[0]->pos);
|
||||
sym->type = d->declarators[0]->GetType(d->declSpecs);
|
||||
}
|
||||
}
|
||||
return sym;
|
||||
}
|
||||
|
||||
child->InitFromType(functionType, ds);
|
||||
type = child->type;
|
||||
name = child->name;
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Declaration
|
||||
@@ -537,18 +598,23 @@ Declaration::GetVariableDeclarations() const {
|
||||
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i) {
|
||||
Declarator *decl = declarators[i];
|
||||
if (decl == NULL)
|
||||
if (decl == NULL || decl->type == NULL) {
|
||||
// Ignore earlier errors
|
||||
Assert(m->errorCount > 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
Symbol *sym = decl->GetSymbol();
|
||||
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
|
||||
|
||||
if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
|
||||
if (Type::Equal(decl->type, AtomicType::Void))
|
||||
Error(decl->pos, "\"void\" type variable illegal in declaration.");
|
||||
else if (CastType<FunctionType>(decl->type) == NULL) {
|
||||
decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
|
||||
Symbol *sym = new Symbol(decl->name, decl->pos, decl->type,
|
||||
decl->storageClass);
|
||||
m->symbolTable->AddVariable(sym);
|
||||
vars.push_back(VariableDeclaration(sym, decl->initExpr));
|
||||
}
|
||||
}
|
||||
|
||||
return vars;
|
||||
}
|
||||
|
||||
@@ -559,18 +625,19 @@ Declaration::DeclareFunctions() {
|
||||
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i) {
|
||||
Declarator *decl = declarators[i];
|
||||
if (decl == NULL)
|
||||
if (decl == NULL || decl->type == NULL) {
|
||||
// Ignore earlier errors
|
||||
Assert(m->errorCount > 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
Symbol *sym = decl->GetSymbol();
|
||||
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
|
||||
|
||||
if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
|
||||
const FunctionType *ftype = CastType<FunctionType>(decl->type);
|
||||
if (ftype == NULL)
|
||||
continue;
|
||||
|
||||
bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
|
||||
m->AddFunctionDeclaration(sym, isInline);
|
||||
m->AddFunctionDeclaration(decl->name, ftype, decl->storageClass,
|
||||
isInline, decl->pos);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -584,13 +651,14 @@ Declaration::Print(int indent) const {
|
||||
declarators[i]->Print(indent+4);
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void
|
||||
GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
|
||||
std::vector<const Type *> *elementTypes,
|
||||
std::vector<std::string> *elementNames,
|
||||
std::vector<SourcePos> *elementPositions) {
|
||||
llvm::SmallVector<const Type *, 8> *elementTypes,
|
||||
llvm::SmallVector<std::string, 8> *elementNames,
|
||||
llvm::SmallVector<SourcePos, 8> *elementPositions) {
|
||||
std::set<std::string> seenNames;
|
||||
for (unsigned int i = 0; i < sd.size(); ++i) {
|
||||
const Type *type = sd[i]->type;
|
||||
@@ -600,35 +668,41 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
|
||||
// FIXME: making this fake little DeclSpecs here is really
|
||||
// disgusting
|
||||
DeclSpecs ds(type);
|
||||
if (type->IsUniformType())
|
||||
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
|
||||
else if (type->IsVaryingType())
|
||||
ds.typeQualifiers |= TYPEQUAL_VARYING;
|
||||
if (Type::Equal(type, AtomicType::Void) == false) {
|
||||
if (type->IsUniformType())
|
||||
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
|
||||
else if (type->IsVaryingType())
|
||||
ds.typeQualifiers |= TYPEQUAL_VARYING;
|
||||
else if (type->GetSOAWidth() != 0)
|
||||
ds.soaWidth = type->GetSOAWidth();
|
||||
// FIXME: ds.vectorSize?
|
||||
}
|
||||
|
||||
for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
|
||||
Declarator *d = (*sd[i]->declarators)[j];
|
||||
d->InitFromDeclSpecs(&ds);
|
||||
|
||||
Symbol *sym = d->GetSymbol();
|
||||
if (Type::Equal(d->type, AtomicType::Void))
|
||||
Error(d->pos, "\"void\" type illegal for struct member.");
|
||||
|
||||
const ArrayType *arrayType =
|
||||
dynamic_cast<const ArrayType *>(sym->type);
|
||||
if (arrayType != NULL && arrayType->GetElementCount() == 0) {
|
||||
Error(d->pos, "Unsized arrays aren't allowed in struct "
|
||||
"definitions.");
|
||||
elementTypes->push_back(NULL);
|
||||
}
|
||||
else
|
||||
elementTypes->push_back(sym->type);
|
||||
elementTypes->push_back(d->type);
|
||||
|
||||
if (seenNames.find(sym->name) != seenNames.end())
|
||||
if (seenNames.find(d->name) != seenNames.end())
|
||||
Error(d->pos, "Struct member \"%s\" has same name as a "
|
||||
"previously-declared member.", sym->name.c_str());
|
||||
"previously-declared member.", d->name.c_str());
|
||||
else
|
||||
seenNames.insert(sym->name);
|
||||
seenNames.insert(d->name);
|
||||
|
||||
elementNames->push_back(sym->name);
|
||||
elementPositions->push_back(sym->pos);
|
||||
elementNames->push_back(d->name);
|
||||
elementPositions->push_back(d->pos);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int)elementTypes->size() - 1; ++i) {
|
||||
const ArrayType *arrayType = CastType<ArrayType>((*elementTypes)[i]);
|
||||
|
||||
if (arrayType != NULL && arrayType->GetElementCount() == 0)
|
||||
Error((*elementPositions)[i], "Unsized arrays aren't allowed except "
|
||||
"for the last member in a struct definition.");
|
||||
}
|
||||
}
|
||||
|
||||
60
decl.h
60
decl.h
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -47,30 +47,21 @@
|
||||
variables--here, that the declaration has the 'static' and 'uniform'
|
||||
qualifiers, and that it's basic type is 'int'. Then for each variable
|
||||
declaration, the Declaraiton class holds an instance of a Declarator,
|
||||
which in turn records the per-variable information like the symbol
|
||||
name, array size (if any), initializer expression, etc.
|
||||
which in turn records the per-variable information like the name, array
|
||||
size (if any), initializer expression, etc.
|
||||
*/
|
||||
|
||||
#ifndef ISPC_DECL_H
|
||||
#define ISPC_DECL_H
|
||||
|
||||
#include "ispc.h"
|
||||
#include <llvm/ADT/SmallVector.h>
|
||||
|
||||
struct VariableDeclaration;
|
||||
|
||||
class Declaration;
|
||||
class Declarator;
|
||||
|
||||
enum StorageClass {
|
||||
SC_NONE,
|
||||
SC_EXTERN,
|
||||
SC_EXPORT,
|
||||
SC_STATIC,
|
||||
SC_TYPEDEF,
|
||||
SC_EXTERN_C
|
||||
};
|
||||
|
||||
|
||||
/* Multiple qualifiers can be provided with types in declarations;
|
||||
therefore, they are set up so that they can be ANDed together into an
|
||||
int. */
|
||||
@@ -82,6 +73,8 @@ enum StorageClass {
|
||||
#define TYPEQUAL_SIGNED (1<<4)
|
||||
#define TYPEQUAL_UNSIGNED (1<<5)
|
||||
#define TYPEQUAL_INLINE (1<<6)
|
||||
#define TYPEQUAL_EXPORT (1<<7)
|
||||
#define TYPEQUAL_UNMASKED (1<<8)
|
||||
|
||||
/** @brief Representation of the declaration specifiers in a declaration.
|
||||
|
||||
@@ -90,7 +83,8 @@ enum StorageClass {
|
||||
*/
|
||||
class DeclSpecs {
|
||||
public:
|
||||
DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE, int tq = TYPEQUAL_NONE);
|
||||
DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE,
|
||||
int tq = TYPEQUAL_NONE);
|
||||
|
||||
void Print() const;
|
||||
|
||||
@@ -117,6 +111,8 @@ public:
|
||||
SOA width specified. Otherwise this is zero.
|
||||
*/
|
||||
int soaWidth;
|
||||
|
||||
std::vector<std::pair<std::string, SourcePos> > declSpecList;
|
||||
};
|
||||
|
||||
|
||||
@@ -138,25 +134,11 @@ public:
|
||||
Declarator(DeclaratorKind dk, SourcePos p);
|
||||
|
||||
/** Once a DeclSpecs instance is available, this method completes the
|
||||
initialization of the Symbol, setting its Type accordingly.
|
||||
initialization of the type member.
|
||||
*/
|
||||
void InitFromDeclSpecs(DeclSpecs *ds);
|
||||
|
||||
/** Get the actual type of the combination of Declarator and the given
|
||||
DeclSpecs. If an explicit base type is provided, the declarator is
|
||||
applied to that type; otherwise the base type from the DeclSpecs is
|
||||
used. */
|
||||
const Type *GetType(DeclSpecs *ds) const;
|
||||
const Type *GetType(const Type *base, DeclSpecs *ds) const;
|
||||
|
||||
/** Returns the symbol corresponding to the function declared by this
|
||||
declarator and symbols for its arguments in *args. */
|
||||
Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
|
||||
|
||||
Symbol *GetSymbolForFunctionParameter(int paramNum) const;
|
||||
|
||||
/** Returns the symbol associated with the declarator. */
|
||||
Symbol *GetSymbol() const;
|
||||
void InitFromType(const Type *base, DeclSpecs *ds);
|
||||
|
||||
void Print(int indent) const;
|
||||
|
||||
@@ -177,18 +159,24 @@ public:
|
||||
/** Type qualifiers provided with the declarator. */
|
||||
int typeQualifiers;
|
||||
|
||||
StorageClass storageClass;
|
||||
|
||||
/** For array declarators, this gives the declared size of the array.
|
||||
Unsized arrays have arraySize == 0. */
|
||||
int arraySize;
|
||||
|
||||
/** Symbol associated with the declarator. */
|
||||
Symbol *sym;
|
||||
/** Name associated with the declarator. */
|
||||
std::string name;
|
||||
|
||||
/** Initialization expression for the variable. May be NULL. */
|
||||
Expr *initExpr;
|
||||
|
||||
/** Type of the declarator. This is NULL until InitFromDeclSpecs() or
|
||||
InitFromType() is called. */
|
||||
const Type *type;
|
||||
|
||||
/** For function declarations, this holds the Declaration *s for the
|
||||
funciton's parameters. */
|
||||
function's parameters. */
|
||||
std::vector<Declaration *> functionParams;
|
||||
};
|
||||
|
||||
@@ -233,8 +221,8 @@ struct StructDeclaration {
|
||||
/** Given a set of StructDeclaration instances, this returns the types of
|
||||
the elements of the corresponding struct and their names. */
|
||||
extern void GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
|
||||
std::vector<const Type *> *elementTypes,
|
||||
std::vector<std::string> *elementNames,
|
||||
std::vector<SourcePos> *elementPositions);
|
||||
llvm::SmallVector<const Type *, 8> *elementTypes,
|
||||
llvm::SmallVector<std::string, 8> *elementNames,
|
||||
llvm::SmallVector<SourcePos, 8> *elementPositions);
|
||||
|
||||
#endif // ISPC_DECL_H
|
||||
|
||||
@@ -1,3 +1,199 @@
|
||||
=== v1.3.0 === (29 June 2012)
|
||||
|
||||
This is a major new release of ispc, with support for more compilation
|
||||
targets and a number of additions to the language. As usual, the quality
|
||||
of generated code has also been improved in a number of cases and a number
|
||||
of small bugs have been fixed.
|
||||
|
||||
New targets:
|
||||
|
||||
* This release provides "beta" support for compiling to Intel® Xeon
|
||||
Phi™ processor, code named Knights Corner, the first processor in
|
||||
the Intel® Many Integrated Core Architecture. See
|
||||
http://ispc.github.com/ispc.html#compiling-for-the-intel-xeon-phi-architecture
|
||||
for more details on this support.
|
||||
|
||||
* This release also has an "avx1.1" target, which provides support for the
|
||||
new instructions in the Intel Ivy Bridge microarchitecutre.
|
||||
|
||||
New language features:
|
||||
|
||||
* The foreach_active statement allows iteration over the active program
|
||||
instances in a gang. (See
|
||||
http://ispc.github.com/ispc.html#iteration-over-active-program-instances-foreach-active)
|
||||
|
||||
* foreach_unique allows iterating over subsets of program instances in a
|
||||
gang that share the same value of a variable. (See
|
||||
http://ispc.github.com/ispc.html#iteration-over-unique-elements-foreach-unique)
|
||||
|
||||
* An "unmasked" function qualifier and statement in the language allow
|
||||
re-activating execution of all program instances in a gang. (See
|
||||
http://ispc.github.com/ispc.html#re-establishing-the-execution-mask
|
||||
|
||||
Standard library updates:
|
||||
|
||||
* The seed_rng() function has been modified to take a "varying" seed value
|
||||
when a varying RNGState is being initialized.
|
||||
|
||||
* An isnan() function has been added, to check for floating-point "not a
|
||||
number" values.
|
||||
|
||||
* The float_to_srgb8() routine does high performance conversion of
|
||||
floating-point color values to SRGB8 format.
|
||||
|
||||
Other changes:
|
||||
|
||||
* A number of bugfixes have been made for compiler crashes with malformed
|
||||
programs.
|
||||
|
||||
* Floating-point comparisons are now "unordered", so that any comparison
|
||||
where one of the operands is a "not a number" value returns false. (This
|
||||
matches standard IEEE floating-point behavior.)
|
||||
|
||||
* The code generated for 'break' statements in "varying" loops has been
|
||||
improved for some common cases.
|
||||
|
||||
* Compile time and compiler memory use have both been improved,
|
||||
particularly for large input programs.
|
||||
|
||||
* A nubmer of bugs have been fixed in the debugging information generated
|
||||
by the compiler when the "-g" command-line flag is used.
|
||||
|
||||
=== v1.2.2 === (20 April 2012)
|
||||
|
||||
This release includes a number of small additions to functionality and a
|
||||
number of bugfixes. New functionality includes:
|
||||
|
||||
* It's now possible to forward declare structures as in C/C++: "struct
|
||||
Foo;". After such a declaration, structs with pointers to "Foo" and
|
||||
functions that take pointers or references to Foo structs can be declared
|
||||
without the entire definition of Foo being available.
|
||||
|
||||
* New built-in types size_t, ptrdiff_t, and [u]intptr_t are now available,
|
||||
corresponding to the equivalent types in C.
|
||||
|
||||
* The standard library now provides atomic_swap*() and
|
||||
atomic_compare_exchange*() functions for void * types.
|
||||
|
||||
* The C++ backend has seen a number of improvements to the quality and
|
||||
readability of generated code.
|
||||
|
||||
A number of bugs have been fixed in this release as well. The most
|
||||
significant are:
|
||||
|
||||
* Fixed a bug where nested loops could cause a compiler crash in some
|
||||
circumstances (issues #240, and #229)
|
||||
|
||||
* Gathers could access invlaid mamory (and cause the program to crash) in
|
||||
some circumstances (#235)
|
||||
|
||||
* References to temporary values are now handled properly when passed to a
|
||||
function that takes a reference typed parameter.
|
||||
|
||||
* A case where incorrect code could be generated for compile-time-constant
|
||||
initializers has been fixed (#234).
|
||||
|
||||
=== v1.2.1 === (6 April 2012)
|
||||
|
||||
This release contains only minor new functionality and is mostly for many
|
||||
small bugfixes and improvements to error handling and error reporting.
|
||||
The new functionality that is present is:
|
||||
|
||||
* Significantly more efficient versions of the float / half conversion
|
||||
routines are now available in the standard library, thanks to Fabian
|
||||
Giesen.
|
||||
|
||||
* The last member of a struct can now be a zero-length array; this allows
|
||||
the trick of dynamically allocating enough storage for the struct and
|
||||
some number of array elements at the end of it.
|
||||
|
||||
Significant bugs fixed include:
|
||||
|
||||
* Issue #205: When a target ISA isn't specified, use the host system's
|
||||
capabilities to choose a target for which it will be able to run the
|
||||
generated code.
|
||||
|
||||
* Issues #215 and #217: Don't allocate storage for global variables that
|
||||
are declared "extern".
|
||||
|
||||
* Issue #197: Allow NULL as a default argument value in a function
|
||||
declaration.
|
||||
|
||||
* Issue #223: Fix bugs where taking the address of a function wouldn't work
|
||||
as expected.
|
||||
|
||||
* Issue #224: When there are overloaded variants of a function that take
|
||||
both reference and const reference parameters, give the non-const
|
||||
reference preference when matching values of that underlying type.
|
||||
|
||||
* Issue #225: An error is issued when a varying lvalue is assigned to a
|
||||
reference type (rather than crashing).
|
||||
|
||||
* Issue #193: Permit conversions from array types to void *, not just the
|
||||
pointer type of the underlying array element.
|
||||
|
||||
* Issue #199: Still evaluate expressions that are cast to (void).
|
||||
|
||||
The documentation has also been improved, with FAQs added to clarify some
|
||||
aspects of the ispc pointer model.
|
||||
|
||||
=== v1.2.0 === (20 March 2012)
|
||||
|
||||
This is a major new release of ispc, with a number of significant
|
||||
improvements to functionality, performance, and compiler robustness. It
|
||||
does, however, include three small changes to language syntax and semantics
|
||||
that may require changes to existing programs:
|
||||
|
||||
* Syntax for the "launch" keyword has been cleaned up; it's now no longer
|
||||
necessary to bracket the launched function call with angle brackets.
|
||||
(In other words, now use "launch foo();", rather than "launch < foo() >;".)
|
||||
|
||||
* When using pointers, the pointed-to data type is now "uniform" by
|
||||
default. Use the varying keyword to specify varying pointed-to types when
|
||||
needed. (i.e. "float *ptr" is a varying pointer to uniform float data,
|
||||
whereas previously it was a varying pointer to varying float values.)
|
||||
Use "varying float *" to specify a varying pointer to varying float data,
|
||||
and so forth.
|
||||
|
||||
* The details of "uniform" and "varying" and how they interact with struct
|
||||
types have been cleaned up. Now, when a struct type is declared, if the
|
||||
struct elements don't have explicit "uniform" or "varying" qualifiers,
|
||||
they are said to have "unbound" variability. When a struct type is
|
||||
instantiated, any unbound variability elements inherit the variability of
|
||||
the parent struct type. See http://ispc.github.com/ispc.html#struct-types
|
||||
for more details.
|
||||
|
||||
ispc has a new language feature that makes it much easier to use the
|
||||
efficient "(array of) structure of arrays" (AoSoA, or SoA) memory layout of
|
||||
data. A new "soa<n>" qualifier can be applied to structure types to
|
||||
specify an n-wide SoA version of the corresponding type. Array indexing
|
||||
and pointer operations with arrays of SoA types automatically handle the
|
||||
two-stage indexing calculation to access the data. See
|
||||
http://ispc.github.com/ispc.html#structure-of-array-types for more details.
|
||||
|
||||
For more efficient access of data that is still in "array of structures"
|
||||
(AoS) format, ispc has a new "memory coalescing" optimization that
|
||||
automatically detects series of strided loads and/or gathers that can be
|
||||
transformed into a more efficient set of vector loads and shuffles. A
|
||||
diagnostic is emitted when this optimization is successfully applied.
|
||||
|
||||
Smaller changes in this release:
|
||||
|
||||
* The standard library now provides memcpy(), memmove() and memset()
|
||||
functions, as well as single-precision asin() and acos() functions.
|
||||
|
||||
* -I can now be specified on the command-line to specify a search path for
|
||||
#include files.
|
||||
|
||||
* A number of improvements have been made to error reporting from the
|
||||
parser, and a number of cases where malformed programs could cause the
|
||||
compiler to crash have been fixed.
|
||||
|
||||
* A number of small improvements to the quality and performance of generated
|
||||
code have been made, including finding more cases where 32-bit addressing
|
||||
calculations can be safely done on 64-bit systems and generating better
|
||||
code for initializer expressions.
|
||||
|
||||
=== v1.1.4 === (4 February 2012)
|
||||
|
||||
There are two major bugfixes for Windows in this release. First, a number
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
#!/bin/bash
|
||||
|
||||
for i in ispc perfguide faq; do
|
||||
rst2html.py --template=template.txt --link-stylesheet \
|
||||
rst2html --template=template.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css $i.rst > $i.html
|
||||
done
|
||||
|
||||
rst2html.py --template=template-perf.txt --link-stylesheet \
|
||||
rst2html --template=template-news.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css news.rst > news.html
|
||||
|
||||
rst2html --template=template-perf.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css perf.rst > perf.html
|
||||
|
||||
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
|
||||
|
||||
397
docs/faq.rst
397
docs/faq.rst
@@ -14,11 +14,24 @@ distribution.
|
||||
+ `Why are there multiple versions of exported ispc functions in the assembly output?`_
|
||||
+ `How can I more easily see gathers and scatters in generated assembly?`_
|
||||
|
||||
* Running The Compiler
|
||||
|
||||
+ `Why is it required to use one of the "generic" targets with C++ output?`_
|
||||
+ `Why won't the compiler generate an object file or assembly output with the "generic" targets?`_
|
||||
|
||||
* Language Details
|
||||
|
||||
+ `What is the difference between "int *foo" and "int foo[]"?`_
|
||||
+ `Why are pointed-to types "uniform" by default?`_
|
||||
+ `What am I getting an error about assigning a varying lvalue to a reference type?`_
|
||||
|
||||
* Interoperability
|
||||
|
||||
+ `How can I supply an initial execution mask in the call from the application?`_
|
||||
+ `How can I generate a single binary executable with support for multiple instruction sets?`_
|
||||
+ `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
|
||||
+ `Is it possible to inline ispc functions in C/C++ code?`_
|
||||
+ `Why is it illegal to pass "varying" values from C/C++ to ispc functions?`_
|
||||
|
||||
* Programming Techniques
|
||||
|
||||
@@ -26,6 +39,8 @@ distribution.
|
||||
+ `How can a gang of program instances generate variable amounts of output efficiently?`_
|
||||
+ `Is it possible to use ispc for explicit vector programming?`_
|
||||
+ `How can I debug my ispc programs using Valgrind?`_
|
||||
+ `foreach statements generate more complex assembly than I'd expect; what's going on?`_
|
||||
+ `How do I launch an individual task for each active program instance?`_
|
||||
|
||||
Understanding ispc's Output
|
||||
===========================
|
||||
@@ -212,6 +227,174 @@ easier to understand:
|
||||
jmp ___pseudo_scatter_base_offsets32_32 ## TAILCALL
|
||||
|
||||
|
||||
Running The Compiler
|
||||
====================
|
||||
|
||||
Why is it required to use one of the "generic" targets with C++ output?
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
The C++ output option transforms the provided ``ispc`` program source into
|
||||
C++ code where each basic operation in the program (addition, comparison,
|
||||
etc.) is represented as a function call to an as-yet-undefined function,
|
||||
chaining the results of these calls together to perform the required
|
||||
computations. It is then expected that the user will provide the
|
||||
implementation of these functions via a header file with ``inline``
|
||||
functions defined for each of these functions and then use a C++ compiler
|
||||
to generate a final object file. (Examples of these headers include
|
||||
``examples/intrinsics/sse4.h`` and ``examples/intrinsics/knc.h`` in the
|
||||
``ispc`` distribution.)
|
||||
|
||||
If a target other than one of the "generic" ones is used with C++ output,
|
||||
then the compiler will transform certain operations into particular code
|
||||
sequences that may not be desired for the actual final target; for example,
|
||||
SSE targets that don't have hardware "gather" instructions will transform a
|
||||
gather into a sequence of scalar load instructions. When this in turn is
|
||||
transformed to C++ code, the fact that the loads were originally a gather
|
||||
is lost, and the header file of function definitions wouldn't have a chance
|
||||
to map the "gather" to a target-specific operation, as the ``knc.h`` header
|
||||
does, for example. Thus, the "generic" targets exist to provide basic
|
||||
targets of various vector widths, without imposing any limitations on the
|
||||
final target's capabilities.
|
||||
|
||||
Why won't the compiler generate an object file or assembly output with the "generic" targets?
|
||||
---------------------------------------------------------------------------------------------
|
||||
|
||||
As described in the above FAQ entry, when compiling to the "generic"
|
||||
targets, ``ispc`` generates vector code for the source program that
|
||||
transforms every basic operation in the program (addition, comparison,
|
||||
etc.) into a separate function call.
|
||||
|
||||
While there is no fundamental reason that the compiler couldn't generate
|
||||
target-specific object code with a function call to an undefined function
|
||||
for each primitive operation, doing so wouldn't actually be useful in
|
||||
practice--providing definitions of these functions in a separate object
|
||||
file and actually performing function calls for each of them (versus
|
||||
turning them into inline function calls) would be a highly inefficient way
|
||||
to run the program.
|
||||
|
||||
Therefore, in the interests of encouraging the use of the system,
|
||||
these types of output are disallowed.
|
||||
|
||||
|
||||
Language Details
|
||||
================
|
||||
|
||||
What is the difference between "int \*foo" and "int foo[]"?
|
||||
-----------------------------------------------------------
|
||||
|
||||
In C and C++, declaring a function to take a parameter ``int *foo`` and
|
||||
``int foo[]`` results in the same type for the parameter. Both are
|
||||
pointers to integers. In ``ispc``, these are different types. The first
|
||||
one is a varying pointer to a uniform integer value in memory, while the
|
||||
second results in a uniform pointer to the start of an array of varying
|
||||
integer values in memory.
|
||||
|
||||
To understand why the first is a varying pointer to a uniform integer,
|
||||
first recall that types without explicit rate qualifiers (``uniform``,
|
||||
``varying``, or ``soa<>``) are ``varying`` by default. Second, recall from
|
||||
the `discussion of pointer types in the ispc User's Guide`_ that pointed-to
|
||||
types without rate qualifiers are ``uniform`` by default. (This second
|
||||
rule is discussed further below, in `Why are pointed-to types "uniform" by
|
||||
default?`_.) The type of ``int *foo`` follows from these.
|
||||
|
||||
.. _discussion of pointer types in the ispc User's Guide: ispc.html#pointer-types
|
||||
|
||||
Conversely, in a function body, ``int foo[10]`` represents a declaration of
|
||||
a 10-element array of varying ``int`` values. In that we'd certainly like
|
||||
to be able to pass such an array to a function that takes a ``int []``
|
||||
parameter, the natural type for an ``int []`` parameter is a uniform
|
||||
pointer to varying integer values.
|
||||
|
||||
In terms of compatibility with C/C++, it's unfortunate that this
|
||||
distinction exists, though any other set of rules seems to introduce more
|
||||
awkwardness than this one. (Though we're interested to hear ideas to
|
||||
improve these rules!).
|
||||
|
||||
Why are pointed-to types "uniform" by default?
|
||||
----------------------------------------------
|
||||
|
||||
In ``ispc``, types without rate qualifiers are "varying" by default, but
|
||||
types pointed to by pointers without rate qualifiers are "uniform" by
|
||||
default. Why this difference?
|
||||
|
||||
::
|
||||
|
||||
int foo; // no rate qualifier, "varying int".
|
||||
uniform int *foo; // pointer type has no rate qualifier, pointed-to does.
|
||||
// "varying pointer to uniform int".
|
||||
int *foo; // neither pointer type nor pointed-to type ("int") have
|
||||
// rate qualifiers. Pointer type is varying by default,
|
||||
// pointed-to is uniform. "varying pointer to uniform int".
|
||||
varying int *foo; // varying pointer to varying int
|
||||
|
||||
The first rule, having types without rate qualifiers be varying by default,
|
||||
is a default that keeps the number of "uniform" or "varying" qualifiers in
|
||||
``ispc`` programs low. Most ``ispc`` programs use mostly "varying"
|
||||
variables, so this rule allows most variables to be declared without also
|
||||
requiring rate qualifiers.
|
||||
|
||||
On a related note, this rule allows many C/C++ functions to be used to
|
||||
define equivalent functions in the SPMD execution model that ``ispc``
|
||||
provides with little or no modification:
|
||||
|
||||
::
|
||||
|
||||
// scalar add in C/C++, SPMD/vector add in ispc
|
||||
int add(int a, int b) { return a + b; }
|
||||
|
||||
This motivation also explains why ``uniform int *foo`` represents a varying
|
||||
pointer; having pointers be varying by default if they don't have rate
|
||||
qualifiers similarly helps with porting code from C/C++ to ``ispc``.
|
||||
|
||||
The trickier issue is why pointed-to types are "uniform" by default. In our
|
||||
experience, data in memory that is accessed via pointers is most often
|
||||
uniform; this generally includes all data that has been allocated and
|
||||
initialized by the C/C++ application code. In practice, "varying" types are
|
||||
more generally (but not exclusively) used for local data in ``ispc``
|
||||
functions. Thus, making the pointed-to type uniform by default leads to
|
||||
more concise code for the most common cases.
|
||||
|
||||
|
||||
What am I getting an error about assigning a varying lvalue to a reference type?
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Given code like the following:
|
||||
|
||||
::
|
||||
|
||||
uniform float a[...];
|
||||
int index = ...;
|
||||
float &r = a[index];
|
||||
|
||||
``ispc`` issues the error "Initializer for reference-type variable "r" must
|
||||
have a uniform lvalue type.". The underlying issue stems from how
|
||||
references are represented in the code generated by ``ispc``. Recall that
|
||||
``ispc`` supports both uniform and varying pointer types--a uniform pointer
|
||||
points to the same location in memory for all program instances in the
|
||||
gang, while a varying pointer allows each program instance to have its own
|
||||
pointer value.
|
||||
|
||||
References are represented as a pointer in the code generated by ``ispc``,
|
||||
though this is generally opaque to the user; in ``ispc``, they are
|
||||
specifically uniform pointers. This design decision was made so that given
|
||||
code like this:
|
||||
|
||||
::
|
||||
|
||||
extern void func(float &val);
|
||||
float foo = ...;
|
||||
func(foo);
|
||||
|
||||
Then the reference would be handled efficiently as a single pointer, rather
|
||||
than unnecessarily being turned into a gang-size of pointers.
|
||||
|
||||
However, an implication of this decision is that it's not possible for
|
||||
references to refer to completely different things for each of the program
|
||||
instances. (And hence the error that is issued). In cases where a unique
|
||||
per-program-instance pointer is needed, a varying pointer should be used
|
||||
instead of a reference.
|
||||
|
||||
|
||||
Interoperability
|
||||
================
|
||||
|
||||
@@ -346,6 +529,92 @@ In a similar fashion, it's possible to find out at run-time the value of
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
Is it possible to inline ispc functions in C/C++ code?
|
||||
------------------------------------------------------
|
||||
|
||||
If you're willing to use the ``clang`` C/C++ compiler that's part of the
|
||||
LLVM tool suite, then it is possible to inline ``ispc`` code with C/C++
|
||||
(and conversely, to inline C/C++ calls in ``ispc``). Doing so can provide
|
||||
performance advantages when calling out to short functions written in the
|
||||
"other" language. Note that you don't need to use ``clang`` to compile all
|
||||
of your C/C++ code, but only for the files where you want to be able to
|
||||
inline. In order to do this, you must have a full installation of LLVM
|
||||
version 3.0 or later, including the ``clang`` compiler.
|
||||
|
||||
The basic approach is to have the various compilers emit LLVM intermediate
|
||||
representation (IR) code and to then use tools from LLVM to link together
|
||||
the IR from the compilers and then re-optimize it, which gives the LLVM
|
||||
optimizer the opportunity to do additional inlining and cross-function
|
||||
optimizations. If you have source files ``foo.ispc`` and ``foo.cpp``,
|
||||
first emit LLVM IR:
|
||||
|
||||
::
|
||||
|
||||
ispc --emit-llvm -o foo_ispc.bc foo.ispc
|
||||
clang -O2 -c -emit-llvm -o foo_cpp.bc foo.cpp
|
||||
|
||||
Next, link the two IR files into a single file and run the LLVM optimizer
|
||||
on the result:
|
||||
|
||||
::
|
||||
|
||||
llvm-link foo_ispc.bc foo_cpp.bc -o - | opt -O3 -o foo_opt.bc
|
||||
|
||||
And finally, generate a native object file:
|
||||
|
||||
::
|
||||
|
||||
llc -filetype=obj foo_opt.bc -o foo.o
|
||||
|
||||
This file can in turn be linked in with the rest of your object files when
|
||||
linking your application.
|
||||
|
||||
(Note that if you're using the AVX instruction set, you must provide the
|
||||
``-mattr=+avx`` flag to ``llc``.)
|
||||
|
||||
|
||||
Why is it illegal to pass "varying" values from C/C++ to ispc functions?
|
||||
------------------------------------------------------------------------
|
||||
|
||||
If any of the types in the parameter list to an exported function is
|
||||
"varying" (including recursively, and members of structure types, etc.),
|
||||
then ``ispc`` will issue an error and refuse to compile the function:
|
||||
|
||||
::
|
||||
|
||||
% echo "export int add(int x) { return ++x; }" | ispc
|
||||
<stdin>:1:12: Error: Illegal to return a "varying" type from exported function "foo"
|
||||
<stdin>:1:20: Error: Varying parameter "x" is illegal in an exported function.
|
||||
|
||||
While there's no fundamental reason why this isn't possible, recall the
|
||||
definition of "varying" variables: they have one value for each program
|
||||
instance in the gang. As such, the number of values and amount of storage
|
||||
required to represent a varying variable depends on the gang size
|
||||
(i.e. ``programCount``), which can have different values depending on the
|
||||
compilation target.
|
||||
|
||||
``ispc`` therefore prohibits passing "varying" values between the
|
||||
application and the ``ispc`` program in order to prevent the
|
||||
application-side code from depending on a particular gang size, in order to
|
||||
encourage portability to different gang sizes. (A generally desirable
|
||||
programming practice.)
|
||||
|
||||
For cases where the size of data is actually fixed from the application
|
||||
side, the value can be passed via a pointer to a short ``uniform`` array,
|
||||
as follows:
|
||||
|
||||
::
|
||||
|
||||
export void add4(uniform int ptr[4]) {
|
||||
foreach (i = 0 ... 4)
|
||||
ptr[i]++;
|
||||
}
|
||||
|
||||
On the 4-wide SSE instruction set, this compiles to a single vector add
|
||||
instruction (and associated move instructions), while it still also
|
||||
efficiently computes the correct result on 8-wide AVX targets.
|
||||
|
||||
|
||||
Programming Techniques
|
||||
======================
|
||||
|
||||
@@ -480,3 +749,131 @@ you can use ``--target=sse4`` when compiling to run with ``valgrind``.
|
||||
Note that ``valgrind`` does not yet support programs that use the AVX
|
||||
instruction set.
|
||||
|
||||
foreach statements generate more complex assembly than I'd expect; what's going on?
|
||||
-----------------------------------------------------------------------------------
|
||||
|
||||
Given a simple ``foreach`` loop like the following:
|
||||
|
||||
::
|
||||
|
||||
void foo(uniform float a[], uniform int count) {
|
||||
foreach (i = 0 ... count)
|
||||
a[i] *= 2;
|
||||
}
|
||||
|
||||
|
||||
the ``ispc`` compiler generates approximately 40 instructions--why isn't
|
||||
the generated code simpler?
|
||||
|
||||
There are two main components to the code: one handles
|
||||
``programCount``-sized chunks of elements of the array, and the other
|
||||
handles any excess elements at the end of the array that don't completely
|
||||
fill a gang. The code for the main loop is essentially what one would
|
||||
expect: a vector of values is loaded from the array, the multiply is done,
|
||||
and the result is stored.
|
||||
|
||||
::
|
||||
|
||||
LBB0_2: ## %foreach_full_body
|
||||
movslq %edx, %rdx
|
||||
vmovups (%rdi,%rdx), %ymm1
|
||||
vmulps %ymm0, %ymm1, %ymm1
|
||||
vmovups %ymm1, (%rdi,%rdx)
|
||||
addl $32, %edx
|
||||
addl $8, %eax
|
||||
cmpl %ecx, %eax
|
||||
jl LBB0_2
|
||||
|
||||
|
||||
Then, there is a sequence of instructions that handles any additional
|
||||
elements at the end of the array. (These instructions don't execute if
|
||||
there aren't any left-over values to process, but they do lengthen the
|
||||
amount of generated code.)
|
||||
|
||||
::
|
||||
|
||||
## BB#4: ## %partial_inner_only
|
||||
vmovd %eax, %xmm0
|
||||
vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
||||
vpermilps $0, %ymm0, %ymm0 ## ymm0 = ymm0[0,0,0,0,4,4,4,4]
|
||||
vextractf128 $1, %ymm0, %xmm3
|
||||
vmovd %esi, %xmm2
|
||||
vmovaps LCPI0_1(%rip), %ymm1
|
||||
vextractf128 $1, %ymm1, %xmm4
|
||||
vpaddd %xmm4, %xmm3, %xmm3
|
||||
# ....
|
||||
vmulps LCPI0_0(%rip), %ymm1, %ymm1
|
||||
vmaskmovps %ymm1, %ymm0, (%rdi,%rax)
|
||||
|
||||
|
||||
If you know that the number of elements to be processed will always be an
|
||||
exact multiple of the 8, 16, etc., then adding a simple assignment to
|
||||
``count`` like the one below gives the compiler enough information to be
|
||||
able to eliminate the code for the additional array elements.
|
||||
|
||||
::
|
||||
|
||||
void foo(uniform float a[], uniform int count) {
|
||||
// This assignment doesn't change the value of count
|
||||
// if it's a multiple of 16, but it gives the compiler
|
||||
// insight into this fact, allowing for simpler code to
|
||||
// be generated for the foreach loop.
|
||||
count = (count & ~(16-1));
|
||||
foreach (i = 0 ... count)
|
||||
a[i] *= 2;
|
||||
}
|
||||
|
||||
With this new version of ``foo()``, only the code for the first loop above
|
||||
is generated.
|
||||
|
||||
|
||||
How do I launch an individual task for each active program instance?
|
||||
--------------------------------------------------------------------
|
||||
|
||||
Recall from the `discussion of "launch" in the ispc User's Guide`_ that a
|
||||
``launch`` statement launches a single task corresponding to a single gang
|
||||
of executing program instances, where the indices of the active program
|
||||
instances are the same as were active when the ``launch`` statement
|
||||
executed.
|
||||
|
||||
.. _discussion of "launch" in the ispc User's Guide: ispc.html#task-parallelism-launch-and-sync-statements
|
||||
|
||||
In some situations, it's desirable to be able to launch an individual task
|
||||
for each executing program instance. For example, we might be performing
|
||||
an iterative computation where a subset of the program instances determine
|
||||
that an item they are responsible for requires additional processing.
|
||||
|
||||
::
|
||||
|
||||
bool itemNeedsMoreProcessing(int);
|
||||
int itemNum = ...;
|
||||
if (itemNeedsMoreProcessing(itemNum)) {
|
||||
// do additional work
|
||||
}
|
||||
|
||||
For performance reasons, it may be desirable to apply an entire gang's
|
||||
worth of computation to each item that needs additional processing;
|
||||
there may be available parallelism in this computation such that we'd like
|
||||
to process each of the items with SPMD computation.
|
||||
|
||||
In this case, the ``foreach_active`` and ``unmasked`` constructs can be
|
||||
applied together to accomplish this goal.
|
||||
|
||||
::
|
||||
|
||||
// do additional work
|
||||
task void doWork(uniform int index);
|
||||
foreach_active (index) {
|
||||
unmasked {
|
||||
launch doWork(extract(itemNum, index));
|
||||
}
|
||||
}
|
||||
|
||||
Recall that the body of the ``foreach_active`` loop runs once for each
|
||||
active program instance, with each active program instance's
|
||||
``programIndex`` value available in ``index`` in the above. In the loop,
|
||||
we can re-establish an "all on" execution mask, enabling execution in all
|
||||
of the program instances in the gang, such that execution in ``doWork()``
|
||||
starts with all instances running. (Alternatively, the ``unmasked`` block
|
||||
could be in the definition of ``doWork()``.)
|
||||
|
||||
|
||||
1618
docs/ispc.rst
1618
docs/ispc.rst
File diff suppressed because it is too large
Load Diff
71
docs/news.rst
Normal file
71
docs/news.rst
Normal file
@@ -0,0 +1,71 @@
|
||||
=========
|
||||
ispc News
|
||||
=========
|
||||
|
||||
ispc 1.3.0 is Released
|
||||
----------------------
|
||||
|
||||
A major new version of ``ispc`` has been released. In addition to a number
|
||||
of new language features, this release notably features initial support for
|
||||
compiling to the Intel Xeon Phi (Many Integrated Core) architecture.
|
||||
|
||||
ispc 1.2.1 is Released
|
||||
----------------------
|
||||
|
||||
This is a bugfix release, fixing approximately 20 bugs in the system and
|
||||
improving error handling and error reporting. New functionality includes
|
||||
very efficient float/half conversion routines thanks to Fabian
|
||||
Giesen. See the `1.2.1 release notes`_ for details.
|
||||
|
||||
.. _1.2.1 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
|
||||
|
||||
ispc 1.2.0 is Released
|
||||
-----------------------
|
||||
|
||||
A new major release was posted on March 20, 2012. This release includes
|
||||
significant new functionality for cleanly handling "structure of arrays"
|
||||
(SoA) data layout and a new model for how uniform and varying are handled
|
||||
with structure types.
|
||||
|
||||
Paper on ispc To Appear in InPar 2012
|
||||
-------------------------------------
|
||||
|
||||
A technical paper on ``ispc``, `ispc: A SPMD Compiler for High-Performance
|
||||
CPU Programming`_, by Matt Pharr and William R. Mark, has been accepted to
|
||||
the `InPar 2012`_ conference. This paper describes a number of the design
|
||||
features and key characteristics of the ``ispc`` implementation.
|
||||
|
||||
(© 2012 IEEE. Personal use of this material is permitted. Permission from
|
||||
IEEE must be obtained for all other uses, in any current or future media,
|
||||
including reprinting/republishing this material for advertising or
|
||||
promotional purposes, creating new collective works, for resale or
|
||||
redistribution to servers or lists, or reuse of any copyrighted component
|
||||
of this work in other works.).
|
||||
|
||||
.. _ispc\: A SPMD Compiler for High-Performance CPU Programming: https://github.com/downloads/ispc/ispc/ispc_inpar_2012.pdf
|
||||
.. _InPar 2012: http://innovativeparallel.org/
|
||||
|
||||
ispc 1.1.4 is Released
|
||||
----------------------
|
||||
|
||||
On February 4, 2012, the 1.1.4 release of ``ispc`` was posted; new features
|
||||
include ``new`` and ``delete`` for dynamic memory allocation in ``ispc``
|
||||
programs, "local" atomic operations in the standard library, and a new
|
||||
scalar compilation target. See the `1.1.4 release notes`_ for details.
|
||||
|
||||
.. _1.1.4 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
|
||||
|
||||
|
||||
ispc 1.1.3 is Released
|
||||
----------------------
|
||||
|
||||
With this release, the language now supports "switch" statements, with the same semantics and syntax as in C.
|
||||
|
||||
This release includes fixes for two important performance related issues:
|
||||
the quality of code generated for "foreach" statements has been
|
||||
substantially improved, and performance regression with code for "gathers"
|
||||
that was introduced in v1.1.2 has been fixed in this release.
|
||||
|
||||
Thanks to Jean-Luc Duprat for a number of patches that improve support for
|
||||
building on various platforms, and to Pierre-Antoine Lacaze for patches so
|
||||
that ispc builds under MinGW.
|
||||
@@ -13,6 +13,7 @@ the most out of ``ispc`` in practice.
|
||||
+ `Improving Control Flow Coherence With "foreach_tiled"`_
|
||||
+ `Using Coherent Control Flow Constructs`_
|
||||
+ `Use "uniform" Whenever Appropriate`_
|
||||
+ `Use "Structure of Arrays" Layout When Possible`_
|
||||
|
||||
* `Tips and Techniques`_
|
||||
|
||||
@@ -20,6 +21,7 @@ the most out of ``ispc`` in practice.
|
||||
+ `Avoid 64-bit Addressing Calculations When Possible`_
|
||||
+ `Avoid Computation With 8 and 16-bit Integer Types`_
|
||||
+ `Implementing Reductions Efficiently`_
|
||||
+ `Using "foreach_active" Effectively`_
|
||||
+ `Using Low-level Vector Tricks`_
|
||||
+ `The "Fast math" Option`_
|
||||
+ `"inline" Aggressively`_
|
||||
@@ -247,6 +249,76 @@ but it's always best to provide the compiler with as much help as possible
|
||||
to understand the actual form of your computation.
|
||||
|
||||
|
||||
Use "Structure of Arrays" Layout When Possible
|
||||
----------------------------------------------
|
||||
|
||||
In general, memory access performance (for both reads and writes) is best
|
||||
when the running program instances access a contiguous region of memory; in
|
||||
this case efficient vector load and store instructions can often be used
|
||||
rather than gathers and scatters. As an example of this issue, consider an
|
||||
array of a simple point datatype laid out and accessed in conventional
|
||||
"array of structures" (AOS) layout:
|
||||
|
||||
::
|
||||
|
||||
struct Point { float x, y, z; };
|
||||
uniform Point pts[...];
|
||||
float v = pts[programIndex].x;
|
||||
|
||||
In the above code, the access to ``pts[programIndex].x`` accesses
|
||||
non-sequential memory locations, due to the ``y`` and ``z`` values between
|
||||
the desired ``x`` values in memory. A "gather" is required to get the
|
||||
value of ``v``, with a corresponding decrease in performance.
|
||||
|
||||
If ``Point`` was defined as a "structure of arrays" (SOA) type, the access
|
||||
can be much more efficient:
|
||||
|
||||
::
|
||||
|
||||
struct Point8 { float x[8], y[8], z[8]; };
|
||||
uniform Point8 pts8[...];
|
||||
int majorIndex = programIndex / 8;
|
||||
int minorIndex = programIndex % 8;
|
||||
float v = pts8[majorIndex].x[minorIndex];
|
||||
|
||||
In this case, each ``Point8`` has 8 ``x`` values contiguous in memory
|
||||
before 8 ``y`` values and then 8 ``z`` values. If the gang size is 8 or
|
||||
less, the access for ``v`` will have the same value of ``majorIndex`` for
|
||||
all program instances and will access consecutive elements of the ``x[8]``
|
||||
array with a vector load. (For larger gang sizes, two 8-wide vector loads
|
||||
would be issues, which is also quite efficient.)
|
||||
|
||||
However, the syntax in the above code is messy; accessing SOA data in this
|
||||
fashion is much less elegant than the corresponding code for accessing the
|
||||
data with AOS layout. The ``soa`` qualifier in ``ispc`` can be used to
|
||||
cause the corresponding transformation to be made to the ``Point`` type,
|
||||
while preserving the clean syntax for data access that comes with AOS
|
||||
layout:
|
||||
|
||||
::
|
||||
|
||||
soa<8> Point pts[...];
|
||||
float v = pts[programIndex].x;
|
||||
|
||||
Thanks to having SOA layout a first-class concept in the language's type
|
||||
system, it's easy to write functions that convert data between the
|
||||
layouts. For example, the ``aos_to_soa`` function below converts ``count``
|
||||
elements of the given ``Point`` type from AOS to 8-wide SOA layout. (It
|
||||
assumes that the caller has pre-allocated sufficient space in the
|
||||
``pts_soa`` output array.
|
||||
|
||||
::
|
||||
|
||||
void aos_to_soa(uniform Point pts_aos[], uniform int count,
|
||||
soa<8> pts_soa[]) {
|
||||
foreach (i = 0 ... count)
|
||||
pts_soa[i] = pts_aos[i];
|
||||
}
|
||||
|
||||
Analogously, a function could be written to convert back from SOA to AOS if
|
||||
needed.
|
||||
|
||||
|
||||
Tips and Techniques
|
||||
===================
|
||||
|
||||
@@ -339,6 +411,12 @@ based on the index, it can be worth doing. See the example
|
||||
``examples/volume_rendering`` in the ``ispc`` distribution for the use of
|
||||
this technique in an instance where it is beneficial to performance.
|
||||
|
||||
Understanding Memory Read Coalescing
|
||||
------------------------------------
|
||||
|
||||
XXXX todo
|
||||
|
||||
|
||||
Avoid 64-bit Addressing Calculations When Possible
|
||||
--------------------------------------------------
|
||||
|
||||
@@ -433,6 +511,43 @@ values--very efficient code in the end.
|
||||
return reduce_add(sum);
|
||||
}
|
||||
|
||||
Using "foreach_active" Effectively
|
||||
----------------------------------
|
||||
|
||||
For high-performance code,
|
||||
|
||||
For example, consider this segment of code, from the introduction of
|
||||
``foreach_active`` in the ispc User's Guide:
|
||||
|
||||
::
|
||||
|
||||
uniform float array[...] = { ... };
|
||||
int index = ...;
|
||||
foreach_active (i) {
|
||||
++array[index];
|
||||
}
|
||||
|
||||
Here, ``index`` was assumed to possibly have the same value for multiple
|
||||
program instances, so the updates to ``array[index]`` are serialized by the
|
||||
``foreach_active`` statement in order to not have undefined results when
|
||||
``index`` values do collide.
|
||||
|
||||
The code generated by the compiler can be improved in this case by making
|
||||
it clear that only a single element of the array is accessed by
|
||||
``array[index]`` and that thus a general gather or scatter isn't required.
|
||||
Specifically, by using the ``extract()`` function from the standard library
|
||||
to extract the current program instance's value of ``index`` into a
|
||||
``uniform`` variable and then using that to index into ``array``, as below,
|
||||
more efficient code is generated.
|
||||
|
||||
::
|
||||
|
||||
foreach_active (instanceNum) {
|
||||
uniform int unifIndex = extract(index, instanceNum);
|
||||
++array[unifIndex];
|
||||
}
|
||||
|
||||
|
||||
Using Low-level Vector Tricks
|
||||
-----------------------------
|
||||
|
||||
@@ -547,7 +662,7 @@ gathers happen.)
|
||||
|
||||
extern "C" {
|
||||
void ISPCInstrument(const char *fn, const char *note,
|
||||
int line, int mask);
|
||||
int line, uint64_t mask);
|
||||
}
|
||||
|
||||
This function is passed the file name of the ``ispc`` file running, a short
|
||||
@@ -560,7 +675,7 @@ as follows:
|
||||
|
||||
::
|
||||
|
||||
ISPCInstrument("foo.ispc", "function entry", 55, 0xf);
|
||||
ISPCInstrument("foo.ispc", "function entry", 55, 0xfull);
|
||||
|
||||
This call indicates that at the currently executing program has just
|
||||
entered the function defined at line 55 of the file ``foo.ispc``, with a
|
||||
|
||||
66
docs/template-news.txt
Normal file
66
docs/template-news.txt
Normal file
@@ -0,0 +1,66 @@
|
||||
%(head_prefix)s
|
||||
%(head)s
|
||||
<script type="text/javascript">
|
||||
|
||||
var _gaq = _gaq || [];
|
||||
_gaq.push(['_setAccount', 'UA-1486404-4']);
|
||||
_gaq.push(['_trackPageview']);
|
||||
|
||||
(function() {
|
||||
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
||||
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
||||
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
||||
})();
|
||||
|
||||
</script>
|
||||
%(stylesheet)s
|
||||
%(body_prefix)s
|
||||
<div id="wrap">
|
||||
<div id="wrap2">
|
||||
<div id="header">
|
||||
<h1 id="logo">Intel SPMD Program Compiler</h1>
|
||||
<div id="slogan">An open-source compiler for high-performance SIMD programming on
|
||||
the CPU</div>
|
||||
</div>
|
||||
<div id="nav">
|
||||
<div id="nbar">
|
||||
<ul>
|
||||
<li><a href="index.html">Overview</a></li>
|
||||
<li id="selected"><a href="news.html">News</a></li>
|
||||
<li><a href="features.html">Features</a></li>
|
||||
<li><a href="downloads.html">Downloads</a></li>
|
||||
<li><a href="documentation.html">Documentation</a></li>
|
||||
<li><a href="perf.html">Performance</a></li>
|
||||
<li><a href="contrib.html">Contributors</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
<div id="content-wrap">
|
||||
<div id="sidebar">
|
||||
<div class="widgetspace">
|
||||
<h1>Resources</h1>
|
||||
<ul class="menu">
|
||||
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-users/">ispc
|
||||
users mailing list</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
|
||||
developers mailing list</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
%(body_pre_docinfo)s
|
||||
%(docinfo)s
|
||||
<div id="content">
|
||||
%(body)s
|
||||
</div>
|
||||
<div class="clearfix"></div>
|
||||
<div id="footer"> © 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<!-- Please Do Not remove this link, thank u -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
%(body_suffix)s
|
||||
@@ -26,10 +26,12 @@
|
||||
<div id="nbar">
|
||||
<ul>
|
||||
<li><a href="index.html">Overview</a></li>
|
||||
<li><a href="news.html">News</a></li>
|
||||
<li><a href="features.html">Features</a></li>
|
||||
<li><a href="downloads.html">Downloads</a></li>
|
||||
<li><a href="documentation.html">Documentation</a></li>
|
||||
<li id="selected"><a href="perf.html">Performance</a></li>
|
||||
<li><a href="contrib.html">Contributors</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
@@ -55,7 +57,7 @@
|
||||
%(body)s
|
||||
</div>
|
||||
<div class="clearfix"></div>
|
||||
<div id="footer"> © 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<div id="footer"> © 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<!-- Please Do Not remove this link, thank u -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -26,10 +26,12 @@
|
||||
<div id="nbar">
|
||||
<ul>
|
||||
<li><a href="index.html">Overview</a></li>
|
||||
<li><a href="news.html">News</a></li>
|
||||
<li><a href="features.html">Features</a></li>
|
||||
<li><a href="downloads.html">Downloads</a></li>
|
||||
<li id="selected"><a href="documentation.html">Documentation</a></li>
|
||||
<li><a href="perf.html">Performance</a></li>
|
||||
<li><a href="contrib.html">Contributors</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
@@ -55,7 +57,7 @@
|
||||
%(body)s
|
||||
</div>
|
||||
<div class="clearfix"></div>
|
||||
<div id="footer"> © 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<div id="footer"> © 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<!-- Please Do Not remove this link, thank u -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
|
||||
# This could be handy for archiving the generated documentation or
|
||||
# if some version control system is used.
|
||||
|
||||
PROJECT_NUMBER = 1.1.4
|
||||
PROJECT_NUMBER = 1.3.0
|
||||
|
||||
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
|
||||
# base path where the generated documentation will be put.
|
||||
|
||||
@@ -39,9 +39,6 @@ example implementation of this function that counts the number of times the
|
||||
callback is made and records some statistics about control flow coherence
|
||||
is provided in the instrument.cpp file.
|
||||
|
||||
*** Note: on Linux, this example currently hits an assertion in LLVM during
|
||||
*** compilation
|
||||
|
||||
|
||||
Deferred
|
||||
========
|
||||
@@ -76,6 +73,14 @@ This directory includes three implementations of the algorithm:
|
||||
light culling and shading.
|
||||
|
||||
|
||||
GMRES
|
||||
=====
|
||||
|
||||
An implementation of the generalized minimal residual method for solving
|
||||
sparse matrix equations.
|
||||
(http://en.wikipedia.org/wiki/Generalized_minimal_residual_method)
|
||||
|
||||
|
||||
Mandelbrot
|
||||
==========
|
||||
|
||||
@@ -110,6 +115,13 @@ This program implements both the Black-Scholes and Binomial options pricing
|
||||
models in both ispc and regular serial C++ code.
|
||||
|
||||
|
||||
Perfbench
|
||||
=========
|
||||
|
||||
This runs a number of microbenchmarks to measure system performance and
|
||||
code generation quality.
|
||||
|
||||
|
||||
RT
|
||||
==
|
||||
|
||||
|
||||
@@ -50,7 +50,6 @@ struct Isect {
|
||||
struct Sphere {
|
||||
vec center;
|
||||
float radius;
|
||||
|
||||
};
|
||||
|
||||
struct Plane {
|
||||
@@ -83,7 +82,7 @@ static inline void vnormalize(vec &v) {
|
||||
|
||||
|
||||
static void
|
||||
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
||||
ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) {
|
||||
float d = -dot(plane.p, plane.n);
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
@@ -103,7 +102,7 @@ ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
||||
|
||||
|
||||
static inline void
|
||||
ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
|
||||
ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) {
|
||||
vec rs = ray.org - sphere.center;
|
||||
|
||||
float B = dot(rs, ray.dir);
|
||||
@@ -148,7 +147,7 @@ orthoBasis(vec basis[3], vec n) {
|
||||
|
||||
|
||||
static float
|
||||
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
||||
ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3],
|
||||
RNGState &rngstate) {
|
||||
float eps = 0.0001f;
|
||||
vec p, n;
|
||||
@@ -204,14 +203,14 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
||||
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
static Sphere spheres[3] = {
|
||||
static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
static uniform Sphere spheres[3] = {
|
||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
|
||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||
RNGState rngstate;
|
||||
|
||||
seed_rng(&rngstate, y0);
|
||||
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
|
||||
float invSamples = 1.f / nsubsamples;
|
||||
|
||||
foreach_tiled(y = y0 ... y1, x = 0 ... w,
|
||||
@@ -269,5 +268,5 @@ static void task ao_task(uniform int width, uniform int height,
|
||||
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
launch[h] < ao_task(w, h, nsubsamples, image) >;
|
||||
launch[h] ao_task(w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
@@ -211,7 +211,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||
RNGState rngstate;
|
||||
|
||||
seed_rng(&rngstate, y0);
|
||||
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
|
||||
|
||||
// Compute the mapping between the 'programCount'-wide program
|
||||
// instances running in parallel and samples in the image.
|
||||
@@ -329,5 +329,5 @@ static void task ao_task(uniform int width, uniform int height,
|
||||
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
launch[h] < ao_task(w, h, nsubsamples, image) >;
|
||||
launch[h] ao_task(w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
@@ -1,16 +1,22 @@
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=tasksys.o
|
||||
TASK_OBJ=objs/tasksys.o
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O2 -m64
|
||||
CC=gcc
|
||||
CCFLAGS=-Iobjs/ -O2 -m64
|
||||
|
||||
LIBS=-lm $(TASK_LIB) -lstdc++
|
||||
ISPC=ispc -O2 --arch=x86-64 $(ISPC_FLAGS)
|
||||
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
|
||||
$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
|
||||
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
|
||||
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o) $(TASK_OBJ))
|
||||
|
||||
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o))
|
||||
CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o))
|
||||
OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS)
|
||||
|
||||
default: $(EXAMPLE)
|
||||
|
||||
@@ -26,12 +32,15 @@ objs/%.cpp objs/%.o objs/%.h: dirs
|
||||
clean:
|
||||
/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16
|
||||
|
||||
$(EXAMPLE): $(CPP_OBJS) $(ISPC_OBJS)
|
||||
$(EXAMPLE): $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
|
||||
|
||||
objs/%.o: %.cpp dirs $(ISPC_HEADER)
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: %.c dirs $(ISPC_HEADER)
|
||||
$(CC) $< $(CCFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp dirs
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
|
||||
@@ -204,6 +204,7 @@ void WriteFrame(const char *filename, const InputData *input,
|
||||
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
fwrite(framebufferAOS, imageBytes, 1, out);
|
||||
fclose(out);
|
||||
|
||||
lAlignedFree(framebufferAOS);
|
||||
}
|
||||
|
||||
@@ -35,35 +35,35 @@
|
||||
|
||||
struct InputDataArrays
|
||||
{
|
||||
uniform float * uniform zBuffer;
|
||||
uniform unsigned int16 * uniform normalEncoded_x; // half float
|
||||
uniform unsigned int16 * uniform normalEncoded_y; // half float
|
||||
uniform unsigned int16 * uniform specularAmount; // half float
|
||||
uniform unsigned int16 * uniform specularPower; // half float
|
||||
uniform unsigned int8 * uniform albedo_x; // unorm8
|
||||
uniform unsigned int8 * uniform albedo_y; // unorm8
|
||||
uniform unsigned int8 * uniform albedo_z; // unorm8
|
||||
uniform float * uniform lightPositionView_x;
|
||||
uniform float * uniform lightPositionView_y;
|
||||
uniform float * uniform lightPositionView_z;
|
||||
uniform float * uniform lightAttenuationBegin;
|
||||
uniform float * uniform lightColor_x;
|
||||
uniform float * uniform lightColor_y;
|
||||
uniform float * uniform lightColor_z;
|
||||
uniform float * uniform lightAttenuationEnd;
|
||||
float *zBuffer;
|
||||
unsigned int16 *normalEncoded_x; // half float
|
||||
unsigned int16 *normalEncoded_y; // half float
|
||||
unsigned int16 *specularAmount; // half float
|
||||
unsigned int16 *specularPower; // half float
|
||||
unsigned int8 *albedo_x; // unorm8
|
||||
unsigned int8 *albedo_y; // unorm8
|
||||
unsigned int8 *albedo_z; // unorm8
|
||||
float *lightPositionView_x;
|
||||
float *lightPositionView_y;
|
||||
float *lightPositionView_z;
|
||||
float *lightAttenuationBegin;
|
||||
float *lightColor_x;
|
||||
float *lightColor_y;
|
||||
float *lightColor_z;
|
||||
float *lightAttenuationEnd;
|
||||
};
|
||||
|
||||
struct InputHeader
|
||||
{
|
||||
uniform float cameraProj[4][4];
|
||||
uniform float cameraNear;
|
||||
uniform float cameraFar;
|
||||
float cameraProj[4][4];
|
||||
float cameraNear;
|
||||
float cameraFar;
|
||||
|
||||
uniform int32 framebufferWidth;
|
||||
uniform int32 framebufferHeight;
|
||||
uniform int32 numLights;
|
||||
uniform int32 inputDataChunkSize;
|
||||
uniform int32 inputDataArrayOffsets[idaNum];
|
||||
int32 framebufferWidth;
|
||||
int32 framebufferHeight;
|
||||
int32 numLights;
|
||||
int32 inputDataChunkSize;
|
||||
int32 inputDataArrayOffsets[idaNum];
|
||||
};
|
||||
|
||||
|
||||
@@ -327,8 +327,8 @@ ShadeTile(
|
||||
|
||||
// Reconstruct normal from G-buffer
|
||||
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
|
||||
float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
|
||||
|
||||
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||
float m = sqrt(4.0f * f - 1.0f);
|
||||
@@ -339,9 +339,9 @@ ShadeTile(
|
||||
|
||||
// Load other G-buffer parameters
|
||||
float surface_specularAmount =
|
||||
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
|
||||
half_to_float(inputData.specularAmount[gBufferOffset]);
|
||||
float surface_specularPower =
|
||||
half_to_float_fast(inputData.specularPower[gBufferOffset]);
|
||||
half_to_float(inputData.specularPower[gBufferOffset]);
|
||||
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||
@@ -514,9 +514,9 @@ RenderStatic(uniform InputHeader &inputHeader,
|
||||
|
||||
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
|
||||
// by MIN_TILE_HEIGHT pixels.
|
||||
launch[num_groups] < RenderTile(num_groups_x, num_groups_y,
|
||||
inputHeader, inputData, visualizeLightCount,
|
||||
framebuffer_r, framebuffer_g, framebuffer_b) >;
|
||||
launch[num_groups] RenderTile(num_groups_x, num_groups_y,
|
||||
inputHeader, inputData, visualizeLightCount,
|
||||
framebuffer_r, framebuffer_g, framebuffer_b);
|
||||
}
|
||||
|
||||
|
||||
@@ -575,8 +575,6 @@ SplitTileMinMax(
|
||||
uniform float light_positionView_z_array[],
|
||||
uniform float light_attenuationEnd_array[],
|
||||
// Outputs
|
||||
// TODO: ISPC doesn't currently like multidimensionsal arrays so we'll do the
|
||||
// indexing math ourselves
|
||||
uniform int32 subtileIndices[],
|
||||
uniform int32 subtileIndicesPitch,
|
||||
uniform int32 subtileNumLights[]
|
||||
|
||||
@@ -87,7 +87,7 @@ int main(int argc, char** argv) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
ispc::RenderStatic(&input->header, &input->arrays,
|
||||
ispc::RenderStatic(input->header, input->arrays,
|
||||
VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer.r, framebuffer.g, framebuffer.b);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
|
||||
@@ -23,6 +23,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "perfbench", "perfbench\perfbench.vcxproj", "{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Win32 = Debug|Win32
|
||||
@@ -119,6 +121,14 @@ Global
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.Build.0 = Debug|x64
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.Build.0 = Release|Win32
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.ActiveCfg = Release|x64
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
||||
8
examples/gmres/Makefile
Normal file
8
examples/gmres/Makefile
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
EXAMPLE=gmres
|
||||
CPP_SRC=algorithm.cpp main.cpp matrix.cpp
|
||||
CC_SRC=mmio.c
|
||||
ISPC_SRC=matrix.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
|
||||
include ../common.mk
|
||||
231
examples/gmres/algorithm.cpp
Normal file
231
examples/gmres/algorithm.cpp
Normal file
@@ -0,0 +1,231 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
/*===========================================================================*\
|
||||
|* Includes
|
||||
\*===========================================================================*/
|
||||
#include "algorithm.h"
|
||||
#include "stdio.h"
|
||||
#include "debug.h"
|
||||
|
||||
|
||||
/*===========================================================================*\
|
||||
|* GMRES
|
||||
\*===========================================================================*/
|
||||
/* upper_triangular_right_solve:
|
||||
* ----------------------------
|
||||
* Given upper triangular matrix R and rhs vector b, solve for
|
||||
* x. This "solve" ignores the rows, columns of R that are greater than the
|
||||
* dimensions of x.
|
||||
*/
|
||||
void upper_triangular_right_solve (const DenseMatrix &R, const Vector &b, Vector &x)
|
||||
{
|
||||
// Dimensionality check
|
||||
ASSERT(R.rows() >= b.size());
|
||||
ASSERT(R.cols() >= x.size());
|
||||
ASSERT(b.size() >= x.size());
|
||||
|
||||
int max_row = x.size() - 1;
|
||||
|
||||
// first solve step:
|
||||
x[max_row] = b[max_row] / R(max_row, max_row);
|
||||
|
||||
for (int row = max_row - 1; row >= 0; row--) {
|
||||
double xi = b[row];
|
||||
for (int col = max_row; col > row; col--)
|
||||
xi -= x[col] * R(row, col);
|
||||
x[row] = xi / R(row, row);
|
||||
}
|
||||
}
|
||||
|
||||
/* create_rotation (used in gmres):
|
||||
* -------------------------------
|
||||
* Construct a Givens rotation to zero out the lowest non-zero entry in a partially
|
||||
* factored Hessenburg matrix. Note that the previous Givens rotations should be
|
||||
* applied to this column before creating a new rotation.
|
||||
*/
|
||||
void create_rotation (const DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
|
||||
{
|
||||
double a = H(col, col);
|
||||
double b = H(col + 1, col);
|
||||
double r;
|
||||
|
||||
if (b == 0) {
|
||||
Cn[col] = copysign(1, a);
|
||||
Sn[col] = 0;
|
||||
}
|
||||
else if (a == 0) {
|
||||
Cn[col] = 0;
|
||||
Sn[col] = copysign(1, b);
|
||||
}
|
||||
else {
|
||||
r = sqrt(a*a + b*b);
|
||||
Sn[col] = -b / r;
|
||||
Cn[col] = a / r;
|
||||
}
|
||||
}
|
||||
|
||||
/* Applies the 'col'th Givens rotation stored in vectors Sn and Cn to the 'col'th
|
||||
* column of the DenseMatrix M. (Previous columns don't need the rotation applied b/c
|
||||
* presumeably, the first col-1 columns are already upper triangular, and so their
|
||||
* entries in the col and col+1 rows are 0.)
|
||||
*/
|
||||
void apply_rotation (DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
|
||||
{
|
||||
double c = Cn[col];
|
||||
double s = Sn[col];
|
||||
double tmp = c * H(col, col) - s * H(col+1, col);
|
||||
H(col+1, col) = s * H(col, col) + c * H(col+1, col);
|
||||
H(col, col) = tmp;
|
||||
}
|
||||
|
||||
/* Applies the 'col'th Givens rotation to the vector.
|
||||
*/
|
||||
void apply_rotation (Vector &v, size_t col, Vector &Cn, Vector &Sn)
|
||||
{
|
||||
double a = v[col];
|
||||
double b = v[col + 1];
|
||||
|
||||
double c = Cn[col];
|
||||
double s = Sn[col];
|
||||
|
||||
v[col] = c * a - s * b;
|
||||
v[col + 1] = s * a + c * b;
|
||||
}
|
||||
|
||||
/* Applies the first 'col' Givens rotations to the newly-created column
|
||||
* of H. (Leaves other columns alone.)
|
||||
*/
|
||||
void update_column (DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
|
||||
{
|
||||
for (int i = 0; i < col; i++) {
|
||||
double c = Cn[i];
|
||||
double s = Sn[i];
|
||||
double t = c * H(i,col) - s * H(i+1,col);
|
||||
H(i+1, col) = s * H(i,col) + c * H(i+1,col);
|
||||
H(i, col) = t;
|
||||
}
|
||||
}
|
||||
|
||||
/* After a new column has been added to the hessenburg matrix, factor it back into
|
||||
* an upper-triangular matrix by:
|
||||
* - applying the previous Givens rotations to the new column
|
||||
* - computing the new Givens rotation to make the column upper triangluar
|
||||
* - applying the new Givens rotation to the column, and
|
||||
* - applying the new Givens rotation to the solution vector
|
||||
*/
|
||||
void update_qr_decomp (DenseMatrix &H, Vector &s, size_t col, Vector &Cn, Vector &Sn)
|
||||
{
|
||||
update_column( H, col, Cn, Sn);
|
||||
create_rotation(H, col, Cn, Sn);
|
||||
apply_rotation( H, col, Cn, Sn);
|
||||
apply_rotation( s, col, Cn, Sn);
|
||||
}
|
||||
|
||||
void gmres (const Matrix &A, const Vector &b, Vector &x, int num_iters, double max_err)
|
||||
{
|
||||
DEBUG_PRINT("gmres starting!\n");
|
||||
x.zero();
|
||||
|
||||
ASSERT(A.rows() == A.cols());
|
||||
DenseMatrix Qstar(num_iters + 1, A.rows());
|
||||
DenseMatrix H(num_iters + 1, num_iters);
|
||||
|
||||
// arrays for storing parameters of givens rotations
|
||||
Vector Sn(num_iters);
|
||||
Vector Cn(num_iters);
|
||||
|
||||
// array for storing the rhs projected onto the hessenburg's column space
|
||||
Vector G(num_iters+1);
|
||||
G.zero();
|
||||
|
||||
double beta = b.norm();
|
||||
G[0] = beta;
|
||||
|
||||
// temp vector, stores Aqi
|
||||
Vector w(A.rows());
|
||||
|
||||
w.copy(b);
|
||||
w.normalize();
|
||||
Qstar.set_row(0, w);
|
||||
|
||||
int iter = 0;
|
||||
Vector temp(A.rows(), false);
|
||||
double rel_err;
|
||||
|
||||
while (iter < num_iters)
|
||||
{
|
||||
// w = Aqi
|
||||
Qstar.row(iter, temp);
|
||||
A.multiply(temp, w);
|
||||
|
||||
// construct ith column of H, i+1th row of Qstar:
|
||||
for (int row = 0; row <= iter; row++) {
|
||||
Qstar.row(row, temp);
|
||||
H(row, iter) = temp.dot(w);
|
||||
w.add_ax(-H(row, iter), temp);
|
||||
}
|
||||
|
||||
H(iter+1, iter) = w.norm();
|
||||
w.divide(H(iter+1, iter));
|
||||
Qstar.set_row(iter+1, w);
|
||||
|
||||
update_qr_decomp (H, G, iter, Cn, Sn);
|
||||
|
||||
rel_err = fabs(G[iter+1] / beta);
|
||||
|
||||
if (rel_err < max_err)
|
||||
break;
|
||||
|
||||
if (iter % 100 == 0)
|
||||
DEBUG_PRINT("Iter %d: %f err\n", iter, rel_err);
|
||||
|
||||
iter++;
|
||||
}
|
||||
|
||||
if (iter == num_iters) {
|
||||
fprintf(stderr, "Error: gmres failed to converge in %d iterations (relative err: %f)\n", num_iters, rel_err);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// We've reached an acceptable solution (?):
|
||||
|
||||
DEBUG_PRINT("gmres completed in %d iterations (rel. resid. %f, max %f)\n", num_iters, rel_err, max_err);
|
||||
Vector y(iter+1);
|
||||
upper_triangular_right_solve(H, G, y);
|
||||
for (int i = 0; i < iter + 1; i++) {
|
||||
Qstar.row(i, temp);
|
||||
x.add_ax(y[i], temp);
|
||||
}
|
||||
}
|
||||
50
examples/gmres/algorithm.h
Normal file
50
examples/gmres/algorithm.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __ALGORITHM_H__
#define __ALGORITHM_H__

#include "matrix.h"


/* Generalized Minimal Residual Method:
 * -----------------------------------
 * Iteratively approximates the solution x of A x = b for a square matrix A.
 * Runs at most num_iters iterations, stopping early once the relative
 * residual falls below err.
 */
void gmres (const Matrix &A, const Vector &b, Vector &x, int num_iters, double err);


#endif
|
||||
8671
examples/gmres/data/c-18/c-18.mtx
Normal file
8671
examples/gmres/data/c-18/c-18.mtx
Normal file
File diff suppressed because it is too large
Load Diff
2176
examples/gmres/data/c-18/c-18_b.mtx
Normal file
2176
examples/gmres/data/c-18/c-18_b.mtx
Normal file
File diff suppressed because it is too large
Load Diff
17847
examples/gmres/data/c-21/c-21.mtx
Normal file
17847
examples/gmres/data/c-21/c-21.mtx
Normal file
File diff suppressed because it is too large
Load Diff
3516
examples/gmres/data/c-21/c-21_b.mtx
Normal file
3516
examples/gmres/data/c-21/c-21_b.mtx
Normal file
File diff suppressed because it is too large
Load Diff
16346
examples/gmres/data/c-22/c-22.mtx
Normal file
16346
examples/gmres/data/c-22/c-22.mtx
Normal file
File diff suppressed because it is too large
Load Diff
3799
examples/gmres/data/c-22/c-22_b.mtx
Normal file
3799
examples/gmres/data/c-22/c-22_b.mtx
Normal file
File diff suppressed because it is too large
Load Diff
26730
examples/gmres/data/c-25/c-25.mtx
Normal file
26730
examples/gmres/data/c-25/c-25.mtx
Normal file
File diff suppressed because it is too large
Load Diff
3804
examples/gmres/data/c-25/c-25_b.mtx
Normal file
3804
examples/gmres/data/c-25/c-25_b.mtx
Normal file
File diff suppressed because it is too large
Load Diff
55
examples/gmres/debug.h
Normal file
55
examples/gmres/debug.h
Normal file
@@ -0,0 +1,55 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __DEBUG_H__
#define __DEBUG_H__

#include <cassert>
#include <cstdio>   // BUG FIX: DEBUG_PRINT expands to printf, which was never declared here


/**************************************************************\
| Macros
\**************************************************************/
#define DEBUG

#ifdef DEBUG
// Checked only in debug builds; compiles to nothing otherwise.
#define ASSERT(expr) assert(expr)
// Debug-build-only logging; compiles to nothing otherwise.
#define DEBUG_PRINT(...) printf(__VA_ARGS__)
#else
#define ASSERT(expr)
#define DEBUG_PRINT(...)
#endif


#endif
|
||||
79
examples/gmres/main.cpp
Normal file
79
examples/gmres/main.cpp
Normal file
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#include "matrix.h"
|
||||
#include "algorithm.h"
|
||||
#include "util.h"
|
||||
#include <cmath>
|
||||
#include "../timing.h"
|
||||
|
||||
|
||||
int main (int argc, char **argv)
|
||||
{
|
||||
if (argc < 4) {
|
||||
printf("usage: %s <input-matrix> <input-rhs> <output-file>\n", argv[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
double gmres_cycles;
|
||||
|
||||
DEBUG_PRINT("Loading A...\n");
|
||||
Matrix *A = CRSMatrix::matrix_from_mtf(argv[1]);
|
||||
if (A == NULL)
|
||||
return -1;
|
||||
DEBUG_PRINT("... size: %lu\n", A->cols());
|
||||
|
||||
DEBUG_PRINT("Loading b...\n");
|
||||
Vector *b = Vector::vector_from_mtf(argv[2]);
|
||||
if (b == NULL)
|
||||
return -1;
|
||||
|
||||
Vector x(A->cols());
|
||||
DEBUG_PRINT("Beginning gmres...\n");
|
||||
gmres(*A, *b, x, A->cols() / 2, .01);
|
||||
|
||||
// Write result out to file
|
||||
x.to_mtf(argv[argc-1]);
|
||||
|
||||
// Compute residual (double-check)
|
||||
#ifdef DEBUG
|
||||
Vector bprime(b->size());
|
||||
A->multiply(x, bprime);
|
||||
Vector resid(bprime.size(), &(bprime[0]));
|
||||
resid.subtract(*b);
|
||||
DEBUG_PRINT("residual error check: %lg\n", resid.norm() / b->norm());
|
||||
#endif
|
||||
// Print profiling results
|
||||
DEBUG_PRINT("-- Total mcycles to solve : %.03f --\n", gmres_cycles);
|
||||
}
|
||||
246
examples/gmres/matrix.cpp
Normal file
246
examples/gmres/matrix.cpp
Normal file
@@ -0,0 +1,246 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
/**************************************************************\
|
||||
| Includes
|
||||
\**************************************************************/
|
||||
#include "matrix.h"
|
||||
#include "matrix_ispc.h"
|
||||
|
||||
extern "C" {
|
||||
#include "mmio.h"
|
||||
}
|
||||
|
||||
/**************************************************************\
|
||||
| DenseMatrix methods
|
||||
\**************************************************************/
|
||||
void DenseMatrix::multiply (const Vector &v, Vector &r) const
|
||||
{
|
||||
// Dimensionality check
|
||||
ASSERT(v.size() == cols());
|
||||
ASSERT(r.size() == rows());
|
||||
|
||||
for (int i = 0; i < rows(); i++)
|
||||
r[i] = v.dot(entries + i * num_cols);
|
||||
}
|
||||
|
||||
const Vector *DenseMatrix::row (size_t row) const {
|
||||
return new Vector(num_cols, entries + row * num_cols, true);
|
||||
}
|
||||
|
||||
void DenseMatrix::row (size_t row, Vector &r) {
|
||||
r.entries = entries + row * cols();
|
||||
r._size = cols();
|
||||
}
|
||||
|
||||
void DenseMatrix::set_row(size_t row, const Vector &v)
|
||||
{
|
||||
ASSERT(v.size() == num_cols);
|
||||
memcpy(entries + row * num_cols, v.entries, num_cols * sizeof(double));
|
||||
}
|
||||
|
||||
|
||||
/**************************************************************\
|
||||
| CRSMatrix Methods
|
||||
\**************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
|
||||
// One (row, col, value) triple read from a Matrix Market file.
struct entry {
    int row;
    int col;
    double val;
};

// Orders entries row-major: by row first, then by column within a row.
bool compare_entries(struct entry i, struct entry j) {
    if (i.row != j.row)
        return i.row < j.row;

    return i.col < j.col;
}
|
||||
|
||||
#define ERR_OUT(...) { fprintf(stderr, __VA_ARGS__); return NULL; }
|
||||
|
||||
// Parses a Matrix Market coordinate (sparse) file into a CRSMatrix.
// Returns NULL (with a message on stderr) on any parse error.
// Caller owns the returned matrix.
CRSMatrix *CRSMatrix::matrix_from_mtf (char *path) {
    FILE *f;
    MM_typecode matcode;

    int m, n, nz;

    if ((f = fopen(path, "r")) == NULL)
        ERR_OUT("Error: %s does not name a valid/readable file.\n", path);

    // BUG FIX: f was leaked on every error path (and on success) below.
    if (mm_read_banner(f, &matcode) != 0) {
        fclose(f);
        ERR_OUT("Error: Could not process Matrix Market banner.\n");
    }

    if (mm_is_complex(matcode)) {
        fclose(f);
        ERR_OUT("Error: Application does not support complex numbers.\n");
    }

    if (mm_is_dense(matcode)) {
        fclose(f);
        ERR_OUT("Error: supplied matrix is dense (should be sparse.)\n");
    }

    if (!mm_is_matrix(matcode)) {
        fclose(f);
        ERR_OUT("Error: %s does not encode a matrix.\n", path);
    }

    if (mm_read_mtx_crd_size(f, &m, &n, &nz) != 0) {
        fclose(f);
        ERR_OUT("Error: could not read matrix size from file.\n");
    }

    if (m != n) {
        fclose(f);
        ERR_OUT("Error: Application does not support non-square matrices.");
    }

    std::vector<struct entry> entries;
    entries.resize(nz);

    for (int i = 0; i < nz; i++) {
        // BUG FIX: the fscanf result was ignored, so a truncated file
        // silently produced garbage entries.
        if (fscanf(f, "%d %d %lg\n", &entries[i].row, &entries[i].col,
                   &entries[i].val) != 3) {
            fclose(f);
            ERR_OUT("Error: premature end of file in %s.\n", path);
        }
        // Adjust from 1-based to 0-based
        entries[i].row--;
        entries[i].col--;
    }
    fclose(f);

    sort(entries.begin(), entries.end(), compare_entries);

    CRSMatrix *M = new CRSMatrix(m, n, nz);
    int cur_row = -1;
    for (int i = 0; i < nz; i++) {
        while (entries[i].row > cur_row)
            M->row_offsets[++cur_row] = i;
        M->entries[i] = entries[i].val;
        M->columns[i] = entries[i].col;
    }
    // BUG FIX: rows after the last nonzero were left with uninitialized
    // offsets; point them just past the last entry so multiply() treats
    // them as empty.
    while (cur_row + 1 < m)
        M->row_offsets[++cur_row] = nz;

    return M;
}
|
||||
|
||||
// Parses a Matrix Market file (dense array or sparse coordinate form,
// expected shape m x 1) into a newly allocated Vector.  Returns NULL
// (with a message on stderr) on error.  Caller owns the result.
Vector *Vector::vector_from_mtf (char *path) {
    FILE *f;
    MM_typecode matcode;

    int m, n, nz;

    if ((f = fopen(path, "r")) == NULL)
        ERR_OUT("Error: %s does not name a valid/readable file.\n", path);

    // BUG FIX: f was leaked on every error path (and on success) below.
    if (mm_read_banner(f, &matcode) != 0) {
        fclose(f);
        ERR_OUT("Error: Could not process Matrix Market banner.\n");
    }

    if (mm_is_complex(matcode)) {
        fclose(f);
        ERR_OUT("Error: Application does not support complex numbers.\n");
    }

    if (mm_is_dense(matcode)) {
        if (mm_read_mtx_array_size(f, &m, &n) != 0) {
            fclose(f);
            ERR_OUT("Error: could not read matrix size from file.\n");
        }
    } else {
        if (mm_read_mtx_crd_size(f, &m, &n, &nz) != 0) {
            fclose(f);
            ERR_OUT("Error: could not read matrix size from file.\n");
        }
    }
    if (n != 1) {
        fclose(f);
        ERR_OUT("Error: %s does not describe a vector.\n", path);
    }

    Vector *x = new Vector(m);

    if (mm_is_dense(matcode)) {
        double val;
        for (int i = 0; i < m; i++) {
            // BUG FIX: fscanf result was ignored; x was also leaked on error.
            if (fscanf(f, "%lg\n", &val) != 1) {
                delete x;
                fclose(f);
                ERR_OUT("Error: premature end of file in %s.\n", path);
            }
            (*x)[i] = val;
        }
    }
    else {
        // Sparse rhs: unset positions stay 0.
        x->zero();
        double val;
        int row;
        int col;
        for (int i = 0; i < nz; i++) {
            if (fscanf(f, "%d %d %lg\n", &row, &col, &val) != 3) {
                delete x;
                fclose(f);
                ERR_OUT("Error: premature end of file in %s.\n", path);
            }
            (*x)[row-1] = val;
        }
    }
    fclose(f);
    return x;
}
|
||||
|
||||
#define ERR(...) { fprintf(stderr, __VA_ARGS__); exit(-1); }
|
||||
|
||||
void Vector::to_mtf (char *path) {
|
||||
FILE *f;
|
||||
MM_typecode matcode;
|
||||
|
||||
mm_initialize_typecode(&matcode);
|
||||
mm_set_matrix(&matcode);
|
||||
mm_set_real(&matcode);
|
||||
mm_set_dense(&matcode);
|
||||
mm_set_general(&matcode);
|
||||
|
||||
if ((f = fopen(path, "w")) == NULL)
|
||||
ERR("Error: cannot open/write to %s\n", path);
|
||||
|
||||
mm_write_banner(f, matcode);
|
||||
mm_write_mtx_array_size(f, size(), 1);
|
||||
for (int i = 0; i < size(); i++)
|
||||
fprintf(f, "%lg\n", entries[i]);
|
||||
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
void CRSMatrix::multiply (const Vector &v, Vector &r) const
|
||||
{
|
||||
ASSERT(v.size() == cols());
|
||||
ASSERT(r.size() == rows());
|
||||
|
||||
for (int row = 0; row < rows(); row++)
|
||||
{
|
||||
int row_offset = row_offsets[row];
|
||||
int next_offset = ((row + 1 == rows()) ? _nonzeroes : row_offsets[row + 1]);
|
||||
|
||||
double sum = 0;
|
||||
for (int i = row_offset; i < next_offset; i++)
|
||||
{
|
||||
sum += v[columns[i]] * entries[i];
|
||||
}
|
||||
r[row] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
void CRSMatrix::zero ( )
|
||||
{
|
||||
entries.clear();
|
||||
row_offsets.clear();
|
||||
columns.clear();
|
||||
_nonzeroes = 0;
|
||||
}
|
||||
279
examples/gmres/matrix.h
Normal file
279
examples/gmres/matrix.h
Normal file
@@ -0,0 +1,279 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __MATRIX_H__
|
||||
#define __MATRIX_H__
|
||||
|
||||
/**************************************************************\
|
||||
| Includes
|
||||
\**************************************************************/
|
||||
#include <cstring> // size_t
|
||||
#include <cstdlib> // malloc, memcpy, etc.
|
||||
#include <cmath> // sqrt
|
||||
#include <vector>
|
||||
|
||||
#include "debug.h"
|
||||
#include "matrix_ispc.h"
|
||||
|
||||
|
||||
class DenseMatrix;
|
||||
/**************************************************************\
|
||||
| Vector class
|
||||
\**************************************************************/
|
||||
class Vector {
|
||||
public:
|
||||
static Vector *vector_from_mtf(char *path);
|
||||
void to_mtf (char *path);
|
||||
|
||||
Vector(size_t size, bool alloc_mem=true)
|
||||
{
|
||||
shared_ptr = false;
|
||||
_size = size;
|
||||
|
||||
if (alloc_mem)
|
||||
entries = (double *) malloc(sizeof(double) * _size);
|
||||
else {
|
||||
shared_ptr = true;
|
||||
entries = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
Vector(size_t size, double *content, bool share_ptr=false)
|
||||
{
|
||||
_size = size;
|
||||
if (share_ptr) {
|
||||
entries = content;
|
||||
shared_ptr = true;
|
||||
}
|
||||
else {
|
||||
shared_ptr = false;
|
||||
entries = (double *) malloc(sizeof(double) * _size);
|
||||
memcpy(entries, content, sizeof(double) * _size);
|
||||
}
|
||||
}
|
||||
|
||||
~Vector() { if (!shared_ptr) free(entries); }
|
||||
|
||||
const double & operator [] (size_t index) const
|
||||
{
|
||||
ASSERT(index < _size);
|
||||
return *(entries + index);
|
||||
}
|
||||
|
||||
double &operator [] (size_t index)
|
||||
{
|
||||
ASSERT(index < _size);
|
||||
return *(entries + index);
|
||||
}
|
||||
|
||||
bool operator == (const Vector &v) const
|
||||
{
|
||||
if (v.size() != _size)
|
||||
return false;
|
||||
|
||||
for (int i = 0; i < _size; i++)
|
||||
if (entries[i] != v[i])
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t size() const {return _size; }
|
||||
|
||||
double dot (const Vector &b) const
|
||||
{
|
||||
ASSERT(b.size() == this->size());
|
||||
return ispc::vector_dot(entries, b.entries, size());
|
||||
}
|
||||
|
||||
double dot (const double * const b) const
|
||||
{
|
||||
return ispc::vector_dot(entries, b, size());
|
||||
}
|
||||
|
||||
void zero ()
|
||||
{
|
||||
ispc::zero(entries, size());
|
||||
}
|
||||
|
||||
double norm () const { return sqrtf(dot(entries)); }
|
||||
|
||||
void normalize () { this->divide(this->norm()); }
|
||||
|
||||
void add (const Vector &a)
|
||||
{
|
||||
ASSERT(size() == a.size());
|
||||
ispc::vector_add(entries, a.entries, size());
|
||||
}
|
||||
|
||||
void subtract (const Vector &s)
|
||||
{
|
||||
ASSERT(size() == s.size());
|
||||
ispc::vector_sub(entries, s.entries, size());
|
||||
}
|
||||
|
||||
void multiply (double scalar)
|
||||
{
|
||||
ispc::vector_mult(entries, scalar, size());
|
||||
}
|
||||
|
||||
void divide (double scalar)
|
||||
{
|
||||
ispc::vector_div(entries, scalar, size());
|
||||
}
|
||||
|
||||
// Note: x may be longer than *(this)
|
||||
void add_ax (double a, const Vector &x) {
|
||||
ASSERT(x.size() >= size());
|
||||
ispc::vector_add_ax(entries, a, x.entries, size());
|
||||
}
|
||||
|
||||
// Note that copy only copies the first size() elements of the
|
||||
// supplied vector, i.e. the supplied vector can be longer than
|
||||
// this one. This is useful in least squares calculations.
|
||||
void copy (const Vector &other) {
|
||||
ASSERT(other.size() >= size());
|
||||
memcpy(entries, other.entries, size() * sizeof(double));
|
||||
}
|
||||
|
||||
friend class DenseMatrix;
|
||||
|
||||
private:
|
||||
size_t _size;
|
||||
bool shared_ptr;
|
||||
double *entries;
|
||||
};
|
||||
|
||||
|
||||
/**************************************************************\
|
||||
| Matrix base class
|
||||
\**************************************************************/
|
||||
class Matrix {
|
||||
friend class Vector;
|
||||
|
||||
public:
|
||||
Matrix(size_t size_r, size_t size_c)
|
||||
{
|
||||
num_rows = size_r;
|
||||
num_cols = size_c;
|
||||
}
|
||||
~Matrix(){}
|
||||
|
||||
size_t rows() const { return num_rows; }
|
||||
size_t cols() const { return num_cols; }
|
||||
|
||||
virtual void multiply (const Vector &v, Vector &r) const = 0;
|
||||
virtual void zero () = 0;
|
||||
|
||||
protected:
|
||||
size_t num_rows;
|
||||
size_t num_cols;
|
||||
};
|
||||
|
||||
/**************************************************************\
|
||||
| DenseMatrix class
|
||||
\**************************************************************/
|
||||
class DenseMatrix : public Matrix {
|
||||
friend class Vector;
|
||||
|
||||
public:
|
||||
DenseMatrix(size_t size_r, size_t size_c) : Matrix(size_r, size_c)
|
||||
{
|
||||
entries = (double *) malloc(size_r * size_c * sizeof(double));
|
||||
}
|
||||
|
||||
DenseMatrix(size_t size_r, size_t size_c, const double *content) : Matrix (size_r, size_c)
|
||||
{
|
||||
entries = (double *) malloc(size_r * size_c * sizeof(double));
|
||||
memcpy(entries, content, size_r * size_c * sizeof(double));
|
||||
}
|
||||
|
||||
virtual void multiply (const Vector &v, Vector &r) const;
|
||||
|
||||
double &operator () (unsigned int r, unsigned int c)
|
||||
{
|
||||
return *(entries + r * num_cols + c);
|
||||
}
|
||||
|
||||
const double &operator () (unsigned int r, unsigned int c) const
|
||||
{
|
||||
return *(entries + r * num_cols + c);
|
||||
}
|
||||
|
||||
const Vector *row(size_t row) const;
|
||||
void row(size_t row, Vector &r);
|
||||
void set_row(size_t row, const Vector &v);
|
||||
|
||||
virtual void zero() { ispc::zero(entries, rows() * cols()); }
|
||||
|
||||
void copy (const DenseMatrix &other)
|
||||
{
|
||||
ASSERT(rows() == other.rows());
|
||||
ASSERT(cols() == other.cols());
|
||||
memcpy(entries, other.entries, rows() * cols() * sizeof(double));
|
||||
}
|
||||
|
||||
private:
|
||||
double *entries;
|
||||
bool shared_ptr;
|
||||
};
|
||||
|
||||
/**************************************************************\
|
||||
| CSRMatrix (compressed row storage, a sparse matrix format)
|
||||
\**************************************************************/
|
||||
class CRSMatrix : public Matrix {
|
||||
public:
|
||||
CRSMatrix (size_t size_r, size_t size_c, size_t nonzeroes) :
|
||||
Matrix(size_r, size_c)
|
||||
{
|
||||
_nonzeroes = nonzeroes;
|
||||
entries.resize(nonzeroes);
|
||||
columns.resize(nonzeroes);
|
||||
row_offsets.resize(size_r);
|
||||
}
|
||||
|
||||
virtual void multiply(const Vector &v, Vector &r) const;
|
||||
|
||||
virtual void zero();
|
||||
|
||||
static CRSMatrix *matrix_from_mtf (char *path);
|
||||
|
||||
private:
|
||||
unsigned int _nonzeroes;
|
||||
std::vector<double> entries;
|
||||
std::vector<int> row_offsets;
|
||||
std::vector<int> columns;
|
||||
};
|
||||
|
||||
#endif
|
||||
122
examples/gmres/matrix.ispc
Normal file
122
examples/gmres/matrix.ispc
Normal file
@@ -0,0 +1,122 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
/**************************************************************\
| General
\**************************************************************/
// Sets data[0..size) to zero.
export void zero (uniform double data[],
                  uniform int size)
{
    foreach (idx = 0 ... size)
        data[idx] = 0.0;
}


/**************************************************************\
| Vector helpers
\**************************************************************/
// a += b, element-wise.
export void vector_add (uniform double a[],
                        const uniform double b[],
                        const uniform int size)
{
    foreach (idx = 0 ... size)
        a[idx] += b[idx];
}

// a -= b, element-wise.
export void vector_sub (uniform double a[],
                        const uniform double b[],
                        const uniform int size)
{
    foreach (idx = 0 ... size)
        a[idx] -= b[idx];
}

// a *= b (scalar), element-wise.
export void vector_mult (uniform double a[],
                         const uniform double b,
                         const uniform int size)
{
    foreach (idx = 0 ... size)
        a[idx] *= b;
}

// a /= b (scalar), element-wise.
export void vector_div (uniform double a[],
                        const uniform double b,
                        const uniform int size)
{
    foreach (idx = 0 ... size)
        a[idx] /= b;
}

// r += a * x (axpy).
export void vector_add_ax (uniform double r[],
                           const uniform double a,
                           const uniform double x[],
                           const uniform int size)
{
    foreach (idx = 0 ... size)
        r[idx] += a * x[idx];
}

// Dot product of a and b; per-lane partial sums are combined with
// reduce_add at the end.
export uniform double vector_dot (const uniform double a[],
                                  const uniform double b[],
                                  const uniform int size)
{
    varying double sum = 0.0;
    foreach (idx = 0 ... size)
        sum += a[idx] * b[idx];
    return reduce_add(sum);
}

/**************************************************************\
| Matrix helpers
\**************************************************************/
// CRS sparse matrix-vector product: r = M * v, one program instance
// per row.  NOTE(review): columns[] and row_offsets[] are declared as
// double but hold integer indices, and `cols` is unused -- confirm
// against the (unseen) caller before changing the signature.
export void sparse_multiply (const uniform double entries[],
                             const uniform double columns[],
                             const uniform double row_offsets[],
                             const uniform int rows,
                             const uniform int cols,
                             const uniform int nonzeroes,
                             const uniform double v[],
                             uniform double r[])
{
    foreach (row = 0 ... rows) {
        // This row's nonzeroes live in [row_offset, next_offset).
        int row_offset = row_offsets[row];
        int next_offset = ((row + 1 == rows) ? nonzeroes : row_offsets[row+1]);

        double sum = 0;
        for (int j = row_offset; j < next_offset; j++)
            sum += v[columns[j]] * entries[j];
        r[row] = sum;
    }
}
|
||||
|
||||
511
examples/gmres/mmio.c
Normal file
511
examples/gmres/mmio.c
Normal file
@@ -0,0 +1,511 @@
|
||||
/*
|
||||
* Matrix Market I/O library for ANSI C
|
||||
*
|
||||
* See http://math.nist.gov/MatrixMarket for details.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "mmio.h"
|
||||
|
||||
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
|
||||
double **val_, int **I_, int **J_)
|
||||
{
|
||||
FILE *f;
|
||||
MM_typecode matcode;
|
||||
int M, N, nz;
|
||||
int i;
|
||||
double *val;
|
||||
int *I, *J;
|
||||
|
||||
if ((f = fopen(fname, "r")) == NULL)
|
||||
return -1;
|
||||
|
||||
|
||||
if (mm_read_banner(f, &matcode) != 0)
|
||||
{
|
||||
printf("mm_read_unsymetric: Could not process Matrix Market banner ");
|
||||
printf(" in file [%s]\n", fname);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
|
||||
mm_is_sparse(matcode)))
|
||||
{
|
||||
fprintf(stderr, "Sorry, this application does not support ");
|
||||
fprintf(stderr, "Market Market type: [%s]\n",
|
||||
mm_typecode_to_str(matcode));
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* find out size of sparse matrix: M, N, nz .... */
|
||||
|
||||
if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
|
||||
{
|
||||
fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
*M_ = M;
|
||||
*N_ = N;
|
||||
*nz_ = nz;
|
||||
|
||||
/* reseve memory for matrices */
|
||||
|
||||
I = (int *) malloc(nz * sizeof(int));
|
||||
J = (int *) malloc(nz * sizeof(int));
|
||||
val = (double *) malloc(nz * sizeof(double));
|
||||
|
||||
*val_ = val;
|
||||
*I_ = I;
|
||||
*J_ = J;
|
||||
|
||||
/* NOTE: when reading in doubles, ANSI C requires the use of the "l" */
|
||||
/* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
|
||||
/* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */
|
||||
|
||||
for (i=0; i<nz; i++)
|
||||
{
|
||||
fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
|
||||
I[i]--; /* adjust from 1-based to 0-based */
|
||||
J[i]--;
|
||||
}
|
||||
fclose(f);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Sanity-check a typecode: reject combinations that have no meaning in
 * the Matrix Market format.  Returns 1 if valid, 0 otherwise. */
int mm_is_valid(MM_typecode matcode)
{
    /* everything must at least be a matrix */
    if (!mm_is_matrix(matcode))
        return 0;

    /* a dense pattern matrix makes no sense */
    if (mm_is_dense(matcode) && mm_is_pattern(matcode))
        return 0;

    /* hermitian only applies to complex data */
    if (mm_is_real(matcode) && mm_is_hermitian(matcode))
        return 0;

    /* pattern matrices carry no values, so hermitian/skew are meaningless */
    if (mm_is_pattern(matcode) &&
        (mm_is_hermitian(matcode) || mm_is_skew(matcode)))
        return 0;

    return 1;
}
|
||||
|
||||
/* Parse the "%%MatrixMarket mtx crd type scheme" banner line and fill
 * *matcode accordingly.
 *
 * Returns 0 on success, or MM_PREMATURE_EOF / MM_NO_HEADER /
 * MM_UNSUPPORTED_TYPE on failure.
 *
 * BUG FIX: the original passed raw (possibly signed, possibly negative)
 * char values to tolower(), which is undefined behavior for values not
 * representable as unsigned char (CERT STR37-C).  The cast below makes
 * the lowercasing well-defined for any input byte.
 */
int mm_read_banner(FILE *f, MM_typecode *matcode)
{
    char line[MM_MAX_LINE_LENGTH];
    char banner[MM_MAX_TOKEN_LENGTH];
    char mtx[MM_MAX_TOKEN_LENGTH];
    char crd[MM_MAX_TOKEN_LENGTH];
    char data_type[MM_MAX_TOKEN_LENGTH];
    char storage_scheme[MM_MAX_TOKEN_LENGTH];
    char *p;

    mm_clear_typecode(matcode);

    if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
        return MM_PREMATURE_EOF;

    if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type,
        storage_scheme) != 5)
        return MM_PREMATURE_EOF;

    /* convert tokens to lower case so the comparisons below are
       case-insensitive */
    for (p = mtx; *p != '\0'; p++)
        *p = (char) tolower((unsigned char) *p);
    for (p = crd; *p != '\0'; p++)
        *p = (char) tolower((unsigned char) *p);
    for (p = data_type; *p != '\0'; p++)
        *p = (char) tolower((unsigned char) *p);
    for (p = storage_scheme; *p != '\0'; p++)
        *p = (char) tolower((unsigned char) *p);

    /* check for banner */
    if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
        return MM_NO_HEADER;

    /* first field should be "mtx" */
    if (strcmp(mtx, MM_MTX_STR) != 0)
        return MM_UNSUPPORTED_TYPE;
    mm_set_matrix(matcode);

    /* second field describes whether this is a sparse matrix (in
       coordinate storage) or a dense array */
    if (strcmp(crd, MM_SPARSE_STR) == 0)
        mm_set_sparse(matcode);
    else if (strcmp(crd, MM_DENSE_STR) == 0)
        mm_set_dense(matcode);
    else
        return MM_UNSUPPORTED_TYPE;

    /* third field: element data type */
    if (strcmp(data_type, MM_REAL_STR) == 0)
        mm_set_real(matcode);
    else if (strcmp(data_type, MM_COMPLEX_STR) == 0)
        mm_set_complex(matcode);
    else if (strcmp(data_type, MM_PATTERN_STR) == 0)
        mm_set_pattern(matcode);
    else if (strcmp(data_type, MM_INT_STR) == 0)
        mm_set_integer(matcode);
    else
        return MM_UNSUPPORTED_TYPE;

    /* fourth field: storage scheme / symmetry */
    if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
        mm_set_general(matcode);
    else if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
        mm_set_symmetric(matcode);
    else if (strcmp(storage_scheme, MM_HERM_STR) == 0)
        mm_set_hermitian(matcode);
    else if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
        mm_set_skew(matcode);
    else
        return MM_UNSUPPORTED_TYPE;

    return 0;
}
|
||||
|
||||
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
|
||||
{
|
||||
if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
|
||||
return MM_COULD_NOT_WRITE_FILE;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
|
||||
{
|
||||
char line[MM_MAX_LINE_LENGTH];
|
||||
int num_items_read;
|
||||
|
||||
/* set return null parameter values, in case we exit with errors */
|
||||
*M = *N = *nz = 0;
|
||||
|
||||
/* now continue scanning until you reach the end-of-comments */
|
||||
do
|
||||
{
|
||||
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
|
||||
return MM_PREMATURE_EOF;
|
||||
}while (line[0] == '%');
|
||||
|
||||
/* line[] is either blank or has M,N, nz */
|
||||
if (sscanf(line, "%d %d %d", M, N, nz) == 3)
|
||||
return 0;
|
||||
|
||||
else
|
||||
do
|
||||
{
|
||||
num_items_read = fscanf(f, "%d %d %d", M, N, nz);
|
||||
if (num_items_read == EOF) return MM_PREMATURE_EOF;
|
||||
}
|
||||
while (num_items_read != 3);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int mm_read_mtx_array_size(FILE *f, int *M, int *N)
|
||||
{
|
||||
char line[MM_MAX_LINE_LENGTH];
|
||||
int num_items_read;
|
||||
/* set return null parameter values, in case we exit with errors */
|
||||
*M = *N = 0;
|
||||
|
||||
/* now continue scanning until you reach the end-of-comments */
|
||||
do
|
||||
{
|
||||
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
|
||||
return MM_PREMATURE_EOF;
|
||||
}while (line[0] == '%');
|
||||
|
||||
/* line[] is either blank or has M,N, nz */
|
||||
if (sscanf(line, "%d %d", M, N) == 2)
|
||||
return 0;
|
||||
|
||||
else /* we have a blank line */
|
||||
do
|
||||
{
|
||||
num_items_read = fscanf(f, "%d %d", M, N);
|
||||
if (num_items_read == EOF) return MM_PREMATURE_EOF;
|
||||
}
|
||||
while (num_items_read != 2);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mm_write_mtx_array_size(FILE *f, int M, int N)
|
||||
{
|
||||
if (fprintf(f, "%d %d\n", M, N) != 2)
|
||||
return MM_COULD_NOT_WRITE_FILE;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*-------------------------------------------------------------------------*/
|
||||
|
||||
/******************************************************************/
|
||||
/* use when I[], J[], and val[]J, and val[] are already allocated */
|
||||
/******************************************************************/
|
||||
|
||||
/* Read nz coordinate entries into the pre-allocated I[], J[] and val[]
 * arrays.  Complex data stores nz real/imaginary pairs, so val[] must
 * hold 2*nz doubles; pattern data fills only the index arrays.
 * Returns 0 on success, MM_PREMATURE_EOF on a short read, or
 * MM_UNSUPPORTED_TYPE for a typecode with no data branch here. */
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
        double val[], MM_typecode matcode)
{
    int k;

    if (mm_is_complex(matcode))
    {
        for (k = 0; k < nz; k++)
        {
            int got = fscanf(f, "%d %d %lg %lg",
                             &I[k], &J[k], &val[2*k], &val[2*k+1]);
            if (got != 4)
                return MM_PREMATURE_EOF;
        }
    }
    else if (mm_is_real(matcode))
    {
        for (k = 0; k < nz; k++)
        {
            int got = fscanf(f, "%d %d %lg\n", &I[k], &J[k], &val[k]);
            if (got != 3)
                return MM_PREMATURE_EOF;
        }
    }
    else if (mm_is_pattern(matcode))
    {
        for (k = 0; k < nz; k++)
        {
            int got = fscanf(f, "%d %d", &I[k], &J[k]);
            if (got != 2)
                return MM_PREMATURE_EOF;
        }
    }
    else
    {
        return MM_UNSUPPORTED_TYPE;
    }

    return 0;
}
|
||||
|
||||
/* Read a single coordinate entry from f according to matcode.
 * Complex data fills both *real and *imag; real data fills only *real;
 * pattern data fills only the indices.  Returns 0 on success,
 * MM_PREMATURE_EOF on a short read, MM_UNSUPPORTED_TYPE otherwise. */
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
        double *real, double *imag, MM_typecode matcode)
{
    int got;

    if (mm_is_complex(matcode))
    {
        got = fscanf(f, "%d %d %lg %lg", I, J, real, imag);
        return (got == 4) ? 0 : MM_PREMATURE_EOF;
    }

    if (mm_is_real(matcode))
    {
        got = fscanf(f, "%d %d %lg\n", I, J, real);
        return (got == 3) ? 0 : MM_PREMATURE_EOF;
    }

    if (mm_is_pattern(matcode))
    {
        got = fscanf(f, "%d %d", I, J);
        return (got == 2) ? 0 : MM_PREMATURE_EOF;
    }

    return MM_UNSUPPORTED_TYPE;
}
|
||||
|
||||
|
||||
/************************************************************************
|
||||
mm_read_mtx_crd() fills M, N, nz, array of values, and return
|
||||
type code, e.g. 'MCRS'
|
||||
|
||||
if matrix is complex, values[] is of size 2*nz,
|
||||
(nz pairs of real/imaginary values)
|
||||
************************************************************************/
|
||||
|
||||
/* High-level reader: open fname (the literal name "stdin" means
 * standard input), parse the banner and size line, allocate the index
 * and value arrays, and read all nz entries.
 *
 * On success returns 0; the caller owns *I, *J and — unless the matrix
 * is pattern-only, in which case *val stays NULL — *val (complex data
 * gets 2*nz doubles).  On failure returns an MM_* error code.
 *
 * Fixes over the original: the stream was leaked on every error return
 * (only the success path closed it), and the malloc() results were
 * never checked before being handed to mm_read_mtx_crd_data().
 */
int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J,
        double **val, MM_typecode *matcode)
{
    int ret_code;
    FILE *f;

    if (strcmp(fname, "stdin") == 0)
        f = stdin;
    else if ((f = fopen(fname, "r")) == NULL)
        return MM_COULD_NOT_READ_FILE;

    if ((ret_code = mm_read_banner(f, matcode)) != 0)
        goto done;

    if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) &&
            mm_is_matrix(*matcode)))
    {
        ret_code = MM_UNSUPPORTED_TYPE;
        goto done;
    }

    if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
        goto done;

    *I = (int *) malloc(*nz * sizeof(int));
    *J = (int *) malloc(*nz * sizeof(int));
    *val = NULL;
    if (*I == NULL || *J == NULL)
    {
        free(*I);
        free(*J);
        *I = *J = NULL;
        /* no dedicated out-of-memory code exists in this API */
        ret_code = MM_COULD_NOT_READ_FILE;
        goto done;
    }

    if (mm_is_complex(*matcode))
    {
        *val = (double *) malloc(*nz * 2 * sizeof(double));
        if (*val == NULL)
        {
            ret_code = MM_COULD_NOT_READ_FILE;
            goto done;
        }
        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
                *matcode);
    }
    else if (mm_is_real(*matcode))
    {
        *val = (double *) malloc(*nz * sizeof(double));
        if (*val == NULL)
        {
            ret_code = MM_COULD_NOT_READ_FILE;
            goto done;
        }
        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
                *matcode);
    }
    else if (mm_is_pattern(*matcode))
    {
        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
                *matcode);
    }
    /* any other valid type (e.g. integer) returns 0 with *val == NULL,
       matching the original behavior */

done:
    if (f != stdin) fclose(f);
    return ret_code;
}
|
||||
|
||||
/* Write the "%%MatrixMarket ..." banner line for matcode.
 * Returns 0 on success, MM_COULD_NOT_WRITE_FILE on failure.
 *
 * BUG FIXES: (1) mm_typecode_to_str() returns NULL for an invalid
 * typecode, and passing NULL to fprintf "%s" is undefined behavior;
 * (2) fprintf() returns the number of characters written, so the
 * original "!= 2" comparison reported failure on every successful
 * write (a negative return is the real error signal).
 */
int mm_write_banner(FILE *f, MM_typecode matcode)
{
    char *str = mm_typecode_to_str(matcode);
    int ret_code;

    if (str == NULL)
        return MM_COULD_NOT_WRITE_FILE;

    ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
    free(str);  /* mm_typecode_to_str() allocates */
    if (ret_code < 0)
        return MM_COULD_NOT_WRITE_FILE;
    else
        return 0;
}
|
||||
|
||||
/* Write a complete coordinate-format Matrix Market file.  The literal
 * name "stdout" writes to standard output.  val[] is ignored for
 * pattern matrices and must hold 2*nz doubles for complex ones.
 * Returns 0 on success, MM_COULD_NOT_WRITE_FILE or
 * MM_UNSUPPORTED_TYPE on failure.
 *
 * Fixes over the original: the string returned by mm_typecode_to_str()
 * was leaked (it is malloc'd), and a NULL return for an invalid
 * typecode would have been passed to fprintf "%s" (undefined behavior).
 */
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
        double val[], MM_typecode matcode)
{
    FILE *f;
    int i;
    char *typestr;

    if (strcmp(fname, "stdout") == 0)
        f = stdout;
    else if ((f = fopen(fname, "w")) == NULL)
        return MM_COULD_NOT_WRITE_FILE;

    /* print banner followed by typecode */
    typestr = mm_typecode_to_str(matcode);
    if (typestr == NULL)
    {
        if (f != stdout) fclose(f);
        return MM_UNSUPPORTED_TYPE;
    }
    fprintf(f, "%s ", MatrixMarketBanner);
    fprintf(f, "%s\n", typestr);
    free(typestr);

    /* print matrix sizes and nonzeros */
    fprintf(f, "%d %d %d\n", M, N, nz);

    /* print values */
    if (mm_is_pattern(matcode))
        for (i = 0; i < nz; i++)
            fprintf(f, "%d %d\n", I[i], J[i]);
    else if (mm_is_real(matcode))
        for (i = 0; i < nz; i++)
            fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
    else if (mm_is_complex(matcode))
        for (i = 0; i < nz; i++)
            fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i],
                    val[2*i+1]);
    else
    {
        /* valid typecode but no writer for it (e.g. integer data) */
        if (f != stdout) fclose(f);
        return MM_UNSUPPORTED_TYPE;
    }

    if (f != stdout) fclose(f);

    return 0;
}
|
||||
|
||||
|
||||
/**
|
||||
* Create a new copy of a string s. mm_strdup() is a common routine, but
|
||||
* not part of ANSI C, so it is included here. Used by mm_typecode_to_str().
|
||||
*
|
||||
*/
|
||||
/**
 * Create a new heap copy of the string s.  strdup() is a common routine
 * but not part of ANSI C, so it is provided here.  Used by
 * mm_typecode_to_str().  The caller must free() the result.
 *
 * BUG FIX: the original passed an unchecked malloc() result straight to
 * strcpy(); on allocation failure that is a NULL dereference (UB).
 * Returns NULL if allocation fails.
 */
char *mm_strdup(const char *s)
{
    size_t len = strlen(s);
    char *s2 = (char *) malloc(len + 1);
    if (s2 == NULL)
        return NULL;
    return strcpy(s2, s);
}
|
||||
|
||||
/* Convert a typecode to a newly-malloc'd human-readable string such as
 * "matrix coordinate real general".  Returns NULL for an invalid
 * typecode (or if allocation fails); otherwise the caller must free()
 * the result.
 *
 * BUG FIX: when the code was not a matrix, the original set a local
 * `error` flag but then carried on, eventually formatting the
 * uninitialized types[0] pointer with sprintf (undefined behavior).
 * That case now returns NULL like every other invalid combination.
 */
char *mm_typecode_to_str(MM_typecode matcode)
{
    char buffer[MM_MAX_LINE_LENGTH];
    const char *types[4];
    char *mm_strdup(const char *);

    /* check for MTX type */
    if (mm_is_matrix(matcode))
        types[0] = MM_MTX_STR;
    else
        return NULL;

    /* check for CRD or ARR matrix */
    if (mm_is_sparse(matcode))
        types[1] = MM_SPARSE_STR;
    else if (mm_is_dense(matcode))
        types[1] = MM_DENSE_STR;
    else
        return NULL;

    /* check for element data type */
    if (mm_is_real(matcode))
        types[2] = MM_REAL_STR;
    else if (mm_is_complex(matcode))
        types[2] = MM_COMPLEX_STR;
    else if (mm_is_pattern(matcode))
        types[2] = MM_PATTERN_STR;
    else if (mm_is_integer(matcode))
        types[2] = MM_INT_STR;
    else
        return NULL;

    /* check for symmetry type */
    if (mm_is_general(matcode))
        types[3] = MM_GENERAL_STR;
    else if (mm_is_symmetric(matcode))
        types[3] = MM_SYMM_STR;
    else if (mm_is_hermitian(matcode))
        types[3] = MM_HERM_STR;
    else if (mm_is_skew(matcode))
        types[3] = MM_SKEW_STR;
    else
        return NULL;

    sprintf(buffer, "%s %s %s %s", types[0], types[1], types[2], types[3]);
    return mm_strdup(buffer);
}
|
||||
135
examples/gmres/mmio.h
Normal file
135
examples/gmres/mmio.h
Normal file
@@ -0,0 +1,135 @@
|
||||
/*
 * Matrix Market I/O library for ANSI C
 *
 * See http://math.nist.gov/MatrixMarket for details.
 *
 *
 */

#ifndef MM_IO_H
#define MM_IO_H

#define MM_MAX_LINE_LENGTH 1025
#define MatrixMarketBanner "%%MatrixMarket"
#define MM_MAX_TOKEN_LENGTH 64

/* 4-character typecode: [object][sparse/dense][data type][storage scheme] */
typedef char MM_typecode[4];

#include <stdio.h>

char *mm_typecode_to_str(MM_typecode matcode);

int mm_read_banner(FILE *f, MM_typecode *matcode);
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
int mm_read_mtx_array_size(FILE *f, int *M, int *N);

int mm_write_banner(FILE *f, MM_typecode matcode);
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
int mm_write_mtx_array_size(FILE *f, int M, int N);


/********************* MM_typecode query functions ***************************/

#define mm_is_matrix(typecode)	((typecode)[0]=='M')

#define mm_is_sparse(typecode)	((typecode)[1]=='C')
#define mm_is_coordinate(typecode)((typecode)[1]=='C')
#define mm_is_dense(typecode)	((typecode)[1]=='A')
#define mm_is_array(typecode)	((typecode)[1]=='A')

#define mm_is_complex(typecode)	((typecode)[2]=='C')
#define mm_is_real(typecode)		((typecode)[2]=='R')
#define mm_is_pattern(typecode)	((typecode)[2]=='P')
#define mm_is_integer(typecode) ((typecode)[2]=='I')

#define mm_is_symmetric(typecode)((typecode)[3]=='S')
#define mm_is_general(typecode)	((typecode)[3]=='G')
#define mm_is_skew(typecode)	((typecode)[3]=='K')
#define mm_is_hermitian(typecode)((typecode)[3]=='H')

int mm_is_valid(MM_typecode matcode);		/* too complex for a macro */


/********************* MM_typecode modify functions ***************************/

#define mm_set_matrix(typecode)	((*typecode)[0]='M')
#define mm_set_coordinate(typecode)	((*typecode)[1]='C')
#define mm_set_array(typecode)	((*typecode)[1]='A')
#define mm_set_dense(typecode)	mm_set_array(typecode)
#define mm_set_sparse(typecode)	mm_set_coordinate(typecode)

#define mm_set_complex(typecode)((*typecode)[2]='C')
#define mm_set_real(typecode)	((*typecode)[2]='R')
#define mm_set_pattern(typecode)((*typecode)[2]='P')
#define mm_set_integer(typecode)((*typecode)[2]='I')


#define mm_set_symmetric(typecode)((*typecode)[3]='S')
#define mm_set_general(typecode)((*typecode)[3]='G')
#define mm_set_skew(typecode)	((*typecode)[3]='K')
#define mm_set_hermitian(typecode)((*typecode)[3]='H')

/* reset to the default "general" code with blank fields */
#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
									(*typecode)[2]=' ',(*typecode)[3]='G')

#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)


/********************* Matrix Market error codes ***************************/


#define MM_COULD_NOT_READ_FILE	11
#define MM_PREMATURE_EOF		12
#define MM_NOT_MTX				13
#define MM_NO_HEADER			14
#define MM_UNSUPPORTED_TYPE		15
#define MM_LINE_TOO_LONG		16
#define MM_COULD_NOT_WRITE_FILE	17


/******************** Matrix Market internal definitions ********************

   MM_matrix_typecode: 4-character sequence

				    object 		sparse/   	data        storage
						  		dense     	type        scheme

   string position:	 [0]        [1]			[2]         [3]

   Matrix typecode:  M(atrix)  C(oord)		R(eal)   	G(eneral)
						        A(array)	C(omplex)   H(ermitian)
											P(attern)   S(ymmetric)
								    		I(nteger)	K(skew)

 ***********************************************************************/

#define MM_MTX_STR		"matrix"
#define MM_ARRAY_STR	"array"
#define MM_DENSE_STR	"array"
#define MM_COORDINATE_STR "coordinate"
#define MM_SPARSE_STR	"coordinate"
#define MM_COMPLEX_STR	"complex"
#define MM_REAL_STR		"real"
#define MM_INT_STR		"integer"
#define MM_GENERAL_STR  "general"
#define MM_SYMM_STR		"symmetric"
#define MM_HERM_STR		"hermitian"
#define MM_SKEW_STR		"skew-symmetric"
#define MM_PATTERN_STR  "pattern"


/* high level routines */

int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
		 double val[], MM_typecode matcode);
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
		double val[], MM_typecode matcode);
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
			MM_typecode matcode);

int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
                double **val_, int **I_, int **J_);



#endif
|
||||
53
examples/gmres/util.h
Normal file
53
examples/gmres/util.h
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __UTIL_H__
|
||||
#define __UTIL_H__
|
||||
|
||||
#include <stdio.h>
|
||||
#include "matrix.h"
|
||||
|
||||
|
||||
// Debug helper: print every entry of a DenseMatrix to stdout, one row
// per line, prefixed with "Matrix <name>:".  Row labels are 1-based
// for human readability even though indexing is 0-based internally.
// NOTE(review): assumes DenseMatrix provides rows(), cols() and
// operator()(row, col) — declared in matrix.h, not visible here.
inline void printMatrix (DenseMatrix &M, const char *name) {
    printf("Matrix %s:\n", name);
    for (int row = 0; row < M.rows(); row++) {
        printf("row %2d: ", row + 1);  // 1-based label
        for (int col = 0; col < M.cols(); col++)
            printf("%6f ", M(row, col));
        printf("\n");
    }
    printf("\n");
}
|
||||
|
||||
#endif
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
|
||||
|
||||
struct __vec16_i1 {
|
||||
__vec16_i1() { }
|
||||
__vec16_i1(const uint16_t &vv) : v(vv) { }
|
||||
__vec16_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3,
|
||||
uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7,
|
||||
uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11,
|
||||
@@ -193,13 +194,22 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
#define CMP_OP(TYPE, CAST, NAME, OP) \
|
||||
static FORCEINLINE __vec16_i1 NAME(TYPE a, TYPE b) { \
|
||||
#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \
|
||||
static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \
|
||||
__vec16_i1 ret; \
|
||||
ret.v = 0; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \
|
||||
return ret; \
|
||||
} \
|
||||
static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \
|
||||
__vec16_i1 mask) { \
|
||||
__vec16_i1 ret; \
|
||||
ret.v = 0; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \
|
||||
ret.v &= mask.v; \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
#define INSERT_EXTRACT(VTYPE, STYPE) \
|
||||
@@ -211,14 +221,16 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
|
||||
}
|
||||
|
||||
#define LOAD_STORE(VTYPE, STYPE) \
|
||||
static FORCEINLINE VTYPE __load(VTYPE *p, int align) { \
|
||||
template <int ALIGN> \
|
||||
static FORCEINLINE VTYPE __load(const VTYPE *p) { \
|
||||
STYPE *ptr = (STYPE *)p; \
|
||||
VTYPE ret; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
ret.v[i] = ptr[i]; \
|
||||
return ret; \
|
||||
} \
|
||||
static FORCEINLINE void __store(VTYPE *p, VTYPE v, int align) { \
|
||||
template <int ALIGN> \
|
||||
static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \
|
||||
STYPE *ptr = (STYPE *)p; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
ptr[i] = v.v[i]; \
|
||||
@@ -259,13 +271,29 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
#define SMEAR(VTYPE, NAME, STYPE) \
|
||||
static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
|
||||
VTYPE ret; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
ret.v[i] = v; \
|
||||
return ret; \
|
||||
} \
|
||||
#define SMEAR(VTYPE, NAME, STYPE) \
|
||||
template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
|
||||
template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) { \
|
||||
VTYPE ret; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
ret.v[i] = v; \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
#define SETZERO(VTYPE, NAME) \
|
||||
template <class RetVecType> VTYPE __setzero_##NAME(); \
|
||||
template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() { \
|
||||
VTYPE ret; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
ret.v[i] = 0; \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
#define UNDEF(VTYPE, NAME) \
|
||||
template <class RetVecType> VTYPE __undef_##NAME(); \
|
||||
template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() { \
|
||||
return VTYPE(); \
|
||||
}
|
||||
|
||||
#define BROADCAST(VTYPE, NAME, STYPE) \
|
||||
static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
|
||||
@@ -311,11 +339,23 @@ INSERT_EXTRACT(__vec1_d, double)
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// mask ops
|
||||
|
||||
static FORCEINLINE uint32_t __movmsk(__vec16_i1 mask) {
|
||||
return mask.v;
|
||||
static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) {
|
||||
return (uint64_t)mask.v;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {
|
||||
static FORCEINLINE bool __any(__vec16_i1 mask) {
|
||||
return (mask.v!=0);
|
||||
}
|
||||
|
||||
static FORCEINLINE bool __all(__vec16_i1 mask) {
|
||||
return (mask.v==0xFFFF);
|
||||
}
|
||||
|
||||
static FORCEINLINE bool __none(__vec16_i1 mask) {
|
||||
return (mask.v==0);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
|
||||
__vec16_i1 r;
|
||||
r.v = (a.v & b.v) | (~a.v & ~b.v);
|
||||
return r;
|
||||
@@ -339,6 +379,24 @@ static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) {
|
||||
return r;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) {
|
||||
__vec16_i1 r;
|
||||
r.v = ~v.v;
|
||||
return r;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) {
|
||||
__vec16_i1 r;
|
||||
r.v = ~a.v & b.v;
|
||||
return r;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) {
|
||||
__vec16_i1 r;
|
||||
r.v = a.v & ~b.v;
|
||||
return r;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a,
|
||||
__vec16_i1 b) {
|
||||
__vec16_i1 r;
|
||||
@@ -362,18 +420,36 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
|
||||
vec->v |= (1 << index);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p, int align) {
|
||||
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
|
||||
uint16_t *ptr = (uint16_t *)p;
|
||||
__vec16_i1 r;
|
||||
r.v = *ptr;
|
||||
return r;
|
||||
}
|
||||
|
||||
static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v, int align) {
|
||||
template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) {
|
||||
uint16_t *ptr = (uint16_t *)p;
|
||||
*ptr = v.v;
|
||||
}
|
||||
|
||||
template <class RetVecType> __vec16_i1 __smear_i1(int i);
|
||||
template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int v) {
|
||||
return __vec16_i1(v, v, v, v, v, v, v, v,
|
||||
v, v, v, v, v, v, v, v);
|
||||
}
|
||||
|
||||
template <class RetVecType> __vec16_i1 __setzero_i1();
|
||||
template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
|
||||
return __vec16_i1(0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0);
|
||||
}
|
||||
|
||||
template <class RetVecType> __vec16_i1 __undef_i1();
|
||||
template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
|
||||
return __vec16_i1();
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// int8
|
||||
|
||||
@@ -398,20 +474,22 @@ SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<)
|
||||
|
||||
CMP_OP(__vec16_i8, int8_t, __equal, ==)
|
||||
CMP_OP(__vec16_i8, int8_t, __not_equal, !=)
|
||||
CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=)
|
||||
CMP_OP(__vec16_i8, int8_t, __signed_less_equal, <=)
|
||||
CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_equal, >=)
|
||||
CMP_OP(__vec16_i8, int8_t, __signed_greater_equal, >=)
|
||||
CMP_OP(__vec16_i8, uint8_t, __unsigned_less_than, <)
|
||||
CMP_OP(__vec16_i8, int8_t, __signed_less_than, <)
|
||||
CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_than, >)
|
||||
CMP_OP(__vec16_i8, int8_t, __signed_greater_than, >)
|
||||
CMP_OP(__vec16_i8, i8, int8_t, __equal, ==)
|
||||
CMP_OP(__vec16_i8, i8, int8_t, __not_equal, !=)
|
||||
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=)
|
||||
CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=)
|
||||
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=)
|
||||
CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=)
|
||||
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <)
|
||||
CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <)
|
||||
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >)
|
||||
CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >)
|
||||
|
||||
SELECT(__vec16_i8)
|
||||
INSERT_EXTRACT(__vec16_i8, int8_t)
|
||||
SMEAR(__vec16_i8, i8, int8_t)
|
||||
SETZERO(__vec16_i8, i8)
|
||||
UNDEF(__vec16_i8, i8)
|
||||
BROADCAST(__vec16_i8, i8, int8_t)
|
||||
ROTATE(__vec16_i8, i8, int8_t)
|
||||
SHUFFLES(__vec16_i8, i8, int8_t)
|
||||
@@ -441,20 +519,22 @@ SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<)
|
||||
|
||||
CMP_OP(__vec16_i16, int16_t, __equal, ==)
|
||||
CMP_OP(__vec16_i16, int16_t, __not_equal, !=)
|
||||
CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=)
|
||||
CMP_OP(__vec16_i16, int16_t, __signed_less_equal, <=)
|
||||
CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_equal, >=)
|
||||
CMP_OP(__vec16_i16, int16_t, __signed_greater_equal, >=)
|
||||
CMP_OP(__vec16_i16, uint16_t, __unsigned_less_than, <)
|
||||
CMP_OP(__vec16_i16, int16_t, __signed_less_than, <)
|
||||
CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_than, >)
|
||||
CMP_OP(__vec16_i16, int16_t, __signed_greater_than, >)
|
||||
CMP_OP(__vec16_i16, i16, int16_t, __equal, ==)
|
||||
CMP_OP(__vec16_i16, i16, int16_t, __not_equal, !=)
|
||||
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=)
|
||||
CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=)
|
||||
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_equal, >=)
|
||||
CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_equal, >=)
|
||||
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <)
|
||||
CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <)
|
||||
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >)
|
||||
CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >)
|
||||
|
||||
SELECT(__vec16_i16)
|
||||
INSERT_EXTRACT(__vec16_i16, int16_t)
|
||||
SMEAR(__vec16_i16, i16, int16_t)
|
||||
SETZERO(__vec16_i16, i16)
|
||||
UNDEF(__vec16_i16, i16)
|
||||
BROADCAST(__vec16_i16, i16, int16_t)
|
||||
ROTATE(__vec16_i16, i16, int16_t)
|
||||
SHUFFLES(__vec16_i16, i16, int16_t)
|
||||
@@ -484,20 +564,22 @@ SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<)
|
||||
|
||||
CMP_OP(__vec16_i32, int32_t, __equal, ==)
|
||||
CMP_OP(__vec16_i32, int32_t, __not_equal, !=)
|
||||
CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=)
|
||||
CMP_OP(__vec16_i32, int32_t, __signed_less_equal, <=)
|
||||
CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_equal, >=)
|
||||
CMP_OP(__vec16_i32, int32_t, __signed_greater_equal, >=)
|
||||
CMP_OP(__vec16_i32, uint32_t, __unsigned_less_than, <)
|
||||
CMP_OP(__vec16_i32, int32_t, __signed_less_than, <)
|
||||
CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_than, >)
|
||||
CMP_OP(__vec16_i32, int32_t, __signed_greater_than, >)
|
||||
CMP_OP(__vec16_i32, i32, int32_t, __equal, ==)
|
||||
CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=)
|
||||
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=)
|
||||
CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=)
|
||||
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=)
|
||||
CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=)
|
||||
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <)
|
||||
CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <)
|
||||
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >)
|
||||
CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >)
|
||||
|
||||
SELECT(__vec16_i32)
|
||||
INSERT_EXTRACT(__vec16_i32, int32_t)
|
||||
SMEAR(__vec16_i32, i32, int32_t)
|
||||
SETZERO(__vec16_i32, i32)
|
||||
UNDEF(__vec16_i32, i32)
|
||||
BROADCAST(__vec16_i32, i32, int32_t)
|
||||
ROTATE(__vec16_i32, i32, int32_t)
|
||||
SHUFFLES(__vec16_i32, i32, int32_t)
|
||||
@@ -527,20 +609,22 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
|
||||
|
||||
CMP_OP(__vec16_i64, int64_t, __equal, ==)
|
||||
CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
|
||||
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
|
||||
CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=)
|
||||
CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=)
|
||||
CMP_OP(__vec16_i64, int64_t, __signed_greater_equal, >=)
|
||||
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_than, <)
|
||||
CMP_OP(__vec16_i64, int64_t, __signed_less_than, <)
|
||||
CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_than, >)
|
||||
CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >)
|
||||
CMP_OP(__vec16_i64, i64, int64_t, __equal, ==)
|
||||
CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=)
|
||||
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=)
|
||||
CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=)
|
||||
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=)
|
||||
CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=)
|
||||
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <)
|
||||
CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <)
|
||||
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >)
|
||||
CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >)
|
||||
|
||||
SELECT(__vec16_i64)
|
||||
INSERT_EXTRACT(__vec16_i64, int64_t)
|
||||
SMEAR(__vec16_i64, i64, int64_t)
|
||||
SETZERO(__vec16_i64, i64)
|
||||
UNDEF(__vec16_i64, i64)
|
||||
BROADCAST(__vec16_i64, i64, int64_t)
|
||||
ROTATE(__vec16_i64, i64, int64_t)
|
||||
SHUFFLES(__vec16_i64, i64, int64_t)
|
||||
@@ -554,14 +638,14 @@ BINARY_OP(__vec16_f, __sub, -)
|
||||
BINARY_OP(__vec16_f, __mul, *)
|
||||
BINARY_OP(__vec16_f, __div, /)
|
||||
|
||||
CMP_OP(__vec16_f, float, __equal, ==)
|
||||
CMP_OP(__vec16_f, float, __not_equal, !=)
|
||||
CMP_OP(__vec16_f, float, __less_than, <)
|
||||
CMP_OP(__vec16_f, float, __less_equal, <=)
|
||||
CMP_OP(__vec16_f, float, __greater_than, >)
|
||||
CMP_OP(__vec16_f, float, __greater_equal, >=)
|
||||
CMP_OP(__vec16_f, float, float, __equal, ==)
|
||||
CMP_OP(__vec16_f, float, float, __not_equal, !=)
|
||||
CMP_OP(__vec16_f, float, float, __less_than, <)
|
||||
CMP_OP(__vec16_f, float, float, __less_equal, <=)
|
||||
CMP_OP(__vec16_f, float, float, __greater_than, >)
|
||||
CMP_OP(__vec16_f, float, float, __greater_equal, >=)
|
||||
|
||||
static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
|
||||
static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) {
|
||||
__vec16_i1 ret;
|
||||
ret.v = 0;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
@@ -569,6 +653,14 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) {
|
||||
__vec16_i1 ret;
|
||||
ret.v = 0;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if 0
|
||||
case Instruction::FRem: intrinsic = "__frem"; break;
|
||||
#endif
|
||||
@@ -576,11 +668,128 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
|
||||
SELECT(__vec16_f)
|
||||
INSERT_EXTRACT(__vec16_f, float)
|
||||
SMEAR(__vec16_f, float, float)
|
||||
SETZERO(__vec16_f, float)
|
||||
UNDEF(__vec16_f, float)
|
||||
BROADCAST(__vec16_f, float, float)
|
||||
ROTATE(__vec16_f, float, float)
|
||||
SHUFFLES(__vec16_f, float, float)
|
||||
LOAD_STORE(__vec16_f, float)
|
||||
|
||||
static FORCEINLINE float __exp_uniform_float(float v) {
|
||||
return expf(v);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) {
|
||||
__vec16_f ret;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
ret.v[i] = expf(v.v[i]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE float __log_uniform_float(float v) {
|
||||
return logf(v);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) {
|
||||
__vec16_f ret;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
ret.v[i] = logf(v.v[i]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE float __pow_uniform_float(float a, float b) {
|
||||
return powf(a, b);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) {
|
||||
__vec16_f ret;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
ret.v[i] = powf(a.v[i], b.v[i]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE int __intbits(float v) {
|
||||
union {
|
||||
float f;
|
||||
int i;
|
||||
} u;
|
||||
u.f = v;
|
||||
return u.i;
|
||||
}
|
||||
|
||||
static FORCEINLINE float __floatbits(int v) {
|
||||
union {
|
||||
float f;
|
||||
int i;
|
||||
} u;
|
||||
u.i = v;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
static FORCEINLINE float __half_to_float_uniform(int16_t h) {
|
||||
static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
|
||||
|
||||
int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits
|
||||
uint32_t exp = shifted_exp & o; // just the exponent
|
||||
o += (127 - 15) << 23; // exponent adjust
|
||||
|
||||
// handle exponent special cases
|
||||
if (exp == shifted_exp) // Inf/NaN?
|
||||
o += (128 - 16) << 23; // extra exp adjust
|
||||
else if (exp == 0) { // Zero/Denormal?
|
||||
o += 1 << 23; // extra exp adjust
|
||||
o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize
|
||||
}
|
||||
|
||||
o |= ((int32_t)(h & 0x8000)) << 16; // sign bit
|
||||
return __floatbits(o);
|
||||
}
|
||||
|
||||
|
||||
static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) {
|
||||
__vec16_f ret;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
ret.v[i] = __half_to_float_uniform(v.v[i]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
static FORCEINLINE int16_t __float_to_half_uniform(float f) {
|
||||
uint32_t sign_mask = 0x80000000u;
|
||||
int32_t o;
|
||||
|
||||
int32_t fint = __intbits(f);
|
||||
int32_t sign = fint & sign_mask;
|
||||
fint ^= sign;
|
||||
|
||||
int32_t f32infty = 255 << 23;
|
||||
o = (fint > f32infty) ? 0x7e00 : 0x7c00;
|
||||
|
||||
// (De)normalized number or zero
|
||||
// update fint unconditionally to save the blending; we don't need it
|
||||
// anymore for the Inf/NaN case anyway.
|
||||
const uint32_t round_mask = ~0xfffu;
|
||||
const int32_t magic = 15 << 23;
|
||||
const int32_t f16infty = 31 << 23;
|
||||
|
||||
int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask;
|
||||
fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
|
||||
|
||||
if (fint < f32infty)
|
||||
o = fint2 >> 13; // Take the bits!
|
||||
|
||||
return (o | (sign >> 16));
|
||||
}
|
||||
|
||||
|
||||
static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) {
|
||||
__vec16_i16 ret;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
ret.v[i] = __float_to_half_uniform(v.v[i]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// double
|
||||
|
||||
@@ -589,14 +798,14 @@ BINARY_OP(__vec16_d, __sub, -)
|
||||
BINARY_OP(__vec16_d, __mul, *)
|
||||
BINARY_OP(__vec16_d, __div, /)
|
||||
|
||||
CMP_OP(__vec16_d, double, __equal, ==)
|
||||
CMP_OP(__vec16_d, double, __not_equal, !=)
|
||||
CMP_OP(__vec16_d, double, __less_than, <)
|
||||
CMP_OP(__vec16_d, double, __less_equal, <=)
|
||||
CMP_OP(__vec16_d, double, __greater_than, >)
|
||||
CMP_OP(__vec16_d, double, __greater_equal, >=)
|
||||
CMP_OP(__vec16_d, double, double, __equal, ==)
|
||||
CMP_OP(__vec16_d, double, double, __not_equal, !=)
|
||||
CMP_OP(__vec16_d, double, double, __less_than, <)
|
||||
CMP_OP(__vec16_d, double, double, __less_equal, <=)
|
||||
CMP_OP(__vec16_d, double, double, __greater_than, >)
|
||||
CMP_OP(__vec16_d, double, double, __greater_equal, >=)
|
||||
|
||||
static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
|
||||
static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) {
|
||||
__vec16_i1 ret;
|
||||
ret.v = 0;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
@@ -604,6 +813,14 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) {
|
||||
__vec16_i1 ret;
|
||||
ret.v = 0;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if 0
|
||||
case Instruction::FRem: intrinsic = "__frem"; break;
|
||||
#endif
|
||||
@@ -611,6 +828,8 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
|
||||
SELECT(__vec16_d)
|
||||
INSERT_EXTRACT(__vec16_d, double)
|
||||
SMEAR(__vec16_d, double, double)
|
||||
SETZERO(__vec16_d, double)
|
||||
UNDEF(__vec16_d, double)
|
||||
BROADCAST(__vec16_d, double, double)
|
||||
ROTATE(__vec16_d, double, double)
|
||||
SHUFFLES(__vec16_d, double, double)
|
||||
@@ -962,8 +1181,8 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// masked load/store
|
||||
|
||||
static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
|
||||
__vec16_i1 mask) {
|
||||
static FORCEINLINE __vec16_i8 __masked_load_i8(void *p,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i8 ret;
|
||||
int8_t *ptr = (int8_t *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
@@ -972,8 +1191,8 @@ static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
|
||||
__vec16_i1 mask) {
|
||||
static FORCEINLINE __vec16_i16 __masked_load_i16(void *p,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i16 ret;
|
||||
int16_t *ptr = (int16_t *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
@@ -982,8 +1201,8 @@ static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
|
||||
__vec16_i1 mask) {
|
||||
static FORCEINLINE __vec16_i32 __masked_load_i32(void *p,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i32 ret;
|
||||
int32_t *ptr = (int32_t *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
@@ -992,8 +1211,18 @@ static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
|
||||
__vec16_i1 mask) {
|
||||
static FORCEINLINE __vec16_f __masked_load_float(void *p,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_f ret;
|
||||
float *ptr = (float *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
if ((mask.v & (1 << i)) != 0)
|
||||
ret.v[i] = ptr[i];
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i64 __masked_load_i64(void *p,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i64 ret;
|
||||
int64_t *ptr = (int64_t *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
@@ -1002,31 +1231,49 @@ static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_8(void *p, __vec16_i8 val,
|
||||
__vec16_i1 mask) {
|
||||
static FORCEINLINE __vec16_d __masked_load_double(void *p,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_d ret;
|
||||
double *ptr = (double *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
if ((mask.v & (1 << i)) != 0)
|
||||
ret.v[i] = ptr[i];
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val,
|
||||
__vec16_i1 mask) {
|
||||
int8_t *ptr = (int8_t *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
if ((mask.v & (1 << i)) != 0)
|
||||
ptr[i] = val.v[i];
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_16(void *p, __vec16_i16 val,
|
||||
__vec16_i1 mask) {
|
||||
static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val,
|
||||
__vec16_i1 mask) {
|
||||
int16_t *ptr = (int16_t *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
if ((mask.v & (1 << i)) != 0)
|
||||
ptr[i] = val.v[i];
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_32(void *p, __vec16_i32 val,
|
||||
__vec16_i1 mask) {
|
||||
static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val,
|
||||
__vec16_i1 mask) {
|
||||
int32_t *ptr = (int32_t *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
if ((mask.v & (1 << i)) != 0)
|
||||
ptr[i] = val.v[i];
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
|
||||
static FORCEINLINE void __masked_store_float(void *p, __vec16_f val,
|
||||
__vec16_i1 mask) {
|
||||
float *ptr = (float *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
if ((mask.v & (1 << i)) != 0)
|
||||
ptr[i] = val.v[i];
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val,
|
||||
__vec16_i1 mask) {
|
||||
int64_t *ptr = (int64_t *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
@@ -1034,24 +1281,42 @@ static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
|
||||
ptr[i] = val.v[i];
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_blend_8(void *p, __vec16_i8 val,
|
||||
__vec16_i1 mask) {
|
||||
__masked_store_8(p, val, mask);
|
||||
static FORCEINLINE void __masked_store_double(void *p, __vec16_d val,
|
||||
__vec16_i1 mask) {
|
||||
double *ptr = (double *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
if ((mask.v & (1 << i)) != 0)
|
||||
ptr[i] = val.v[i];
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_blend_16(void *p, __vec16_i16 val,
|
||||
static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val,
|
||||
__vec16_i1 mask) {
|
||||
__masked_store_16(p, val, mask);
|
||||
__masked_store_i8(p, val, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_blend_32(void *p, __vec16_i32 val,
|
||||
__vec16_i1 mask) {
|
||||
__masked_store_32(p, val, mask);
|
||||
static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val,
|
||||
__vec16_i1 mask) {
|
||||
__masked_store_i16(p, val, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
|
||||
__vec16_i1 mask) {
|
||||
__masked_store_64(p, val, mask);
|
||||
static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val,
|
||||
__vec16_i1 mask) {
|
||||
__masked_store_i32(p, val, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
|
||||
__vec16_i1 mask) {
|
||||
__masked_store_float(p, val, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val,
|
||||
__vec16_i1 mask) {
|
||||
__masked_store_i64(p, val, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val,
|
||||
__vec16_i1 mask) {
|
||||
__masked_store_double(p, val, mask);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -1060,29 +1325,31 @@ static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
|
||||
// offsets * offsetScale is in bytes (for all of these)
|
||||
|
||||
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
__vec16_i1 mask) { \
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
|
||||
OTYPE offset, __vec16_i1 mask) { \
|
||||
VTYPE ret; \
|
||||
int8_t *base = (int8_t *)b; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
if ((mask.v & (1 << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
|
||||
constOffset.v[i]); \
|
||||
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
|
||||
ret.v[i] = *ptr; \
|
||||
} \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
|
||||
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
|
||||
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
|
||||
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
|
||||
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
|
||||
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
|
||||
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
|
||||
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
|
||||
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
|
||||
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float)
|
||||
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float)
|
||||
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
|
||||
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
|
||||
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double)
|
||||
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double)
|
||||
|
||||
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
|
||||
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
|
||||
@@ -1095,39 +1362,46 @@ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __gather32_i8)
|
||||
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8)
|
||||
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __gather32_i8)
|
||||
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8)
|
||||
GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __gather32_i16)
|
||||
GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16)
|
||||
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32)
|
||||
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32)
|
||||
GATHER_GENERAL(__vec16_f, float, __vec16_i32, __gather32_float)
|
||||
GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float)
|
||||
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64)
|
||||
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
|
||||
GATHER_GENERAL(__vec16_d, double, __vec16_i32, __gather32_double)
|
||||
GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double)
|
||||
|
||||
// scatter
|
||||
|
||||
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
VTYPE val, __vec16_i1 mask) { \
|
||||
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
|
||||
OTYPE offset, VTYPE val, \
|
||||
__vec16_i1 mask) { \
|
||||
int8_t *base = (int8_t *)b; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
if ((mask.v & (1 << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
|
||||
constOffset.v[i]); \
|
||||
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
|
||||
*ptr = val.v[i]; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
|
||||
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float)
|
||||
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
|
||||
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double)
|
||||
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double)
|
||||
|
||||
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
|
||||
@@ -1139,14 +1413,18 @@ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
|
||||
} \
|
||||
}
|
||||
|
||||
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
|
||||
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8)
|
||||
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
|
||||
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8)
|
||||
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16)
|
||||
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16)
|
||||
SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32)
|
||||
SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32)
|
||||
SCATTER_GENERAL(__vec16_f, float, __vec16_i32, __scatter32_float)
|
||||
SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float)
|
||||
SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64)
|
||||
SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64)
|
||||
SCATTER_GENERAL(__vec16_d, double, __vec16_i32, __scatter32_double)
|
||||
SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// packed load/store
|
||||
|
||||
1828
examples/intrinsics/generic-32.h
Normal file
1828
examples/intrinsics/generic-32.h
Normal file
File diff suppressed because it is too large
Load Diff
1961
examples/intrinsics/generic-64.h
Normal file
1961
examples/intrinsics/generic-64.h
Normal file
File diff suppressed because it is too large
Load Diff
2800
examples/intrinsics/knc.h
Normal file
2800
examples/intrinsics/knc.h
Normal file
File diff suppressed because it is too large
Load Diff
2058
examples/intrinsics/knc2x.h
Normal file
2058
examples/intrinsics/knc2x.h
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -40,8 +40,10 @@ static inline int mandel(float c_re, float c_im, int count) {
|
||||
|
||||
float new_re = z_re*z_re - z_im*z_im;
|
||||
float new_im = 2.f * z_re * z_im;
|
||||
z_re = c_re + new_re;
|
||||
z_im = c_im + new_im;
|
||||
unmasked {
|
||||
z_re = c_re + new_re;
|
||||
z_im = c_im + new_im;
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -41,8 +41,10 @@ mandel(float c_re, float c_im, int count) {
|
||||
|
||||
float new_re = z_re*z_re - z_im*z_im;
|
||||
float new_im = 2.f * z_re * z_im;
|
||||
z_re = c_re + new_re;
|
||||
z_im = c_im + new_im;
|
||||
unmasked {
|
||||
z_re = c_re + new_re;
|
||||
z_im = c_im + new_im;
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
@@ -79,6 +81,6 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
uniform float dy = (y1 - y0) / height;
|
||||
uniform int span = 4;
|
||||
|
||||
launch[height/span] < mandelbrot_scanline(x0, dx, y0, dy, width, height, span,
|
||||
maxIterations, output) >;
|
||||
launch[height/span] mandelbrot_scanline(x0, dx, y0, dy, width, height, span,
|
||||
maxIterations, output);
|
||||
}
|
||||
|
||||
@@ -77,7 +77,7 @@ black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float T
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
uniform int nTasks = max((int)64, (int)count/16384);
|
||||
launch[nTasks] < bs_task(Sa, Xa, Ta, ra, va, result, count) >;
|
||||
launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
|
||||
}
|
||||
|
||||
|
||||
@@ -150,5 +150,5 @@ binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
|
||||
uniform float va[], uniform float result[],
|
||||
uniform int count) {
|
||||
uniform int nTasks = max((int)64, (int)count/16384);
|
||||
launch[nTasks] < binomial_task(Sa, Xa, Ta, ra, va, result, count) >;
|
||||
launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);
|
||||
}
|
||||
|
||||
7
examples/perfbench/Makefile
Normal file
7
examples/perfbench/Makefile
Normal file
@@ -0,0 +1,7 @@
|
||||
|
||||
EXAMPLE=perbench
|
||||
CPP_SRC=perfbench.cpp perfbench_serial.cpp
|
||||
ISPC_SRC=perfbench.ispc
|
||||
ISPC_TARGETS=sse2,sse4,avx
|
||||
|
||||
include ../common.mk
|
||||
108
examples/perfbench/perfbench.cpp
Normal file
108
examples/perfbench/perfbench.cpp
Normal file
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
|
||||
#include "perfbench_ispc.h"
|
||||
|
||||
typedef void (FuncType)(float *, int, float *, float *);
|
||||
|
||||
struct PerfTest {
|
||||
FuncType *aFunc;
|
||||
const char *aName;
|
||||
FuncType *bFunc;
|
||||
const char *bName;
|
||||
const char *testName;
|
||||
};
|
||||
|
||||
extern void xyzSumAOS(float *a, int count, float *zeros, float *result);
|
||||
extern void xyzSumSOA(float *a, int count, float *zeros, float *result);
|
||||
|
||||
|
||||
static void
|
||||
lInitData(float *ptr, int count) {
|
||||
for (int i = 0; i < count; ++i)
|
||||
ptr[i] = float(i) / (1024.f * 1024.f);
|
||||
}
|
||||
|
||||
static PerfTest tests[] = {
|
||||
{ xyzSumAOS, "serial", ispc::xyzSumAOS, "ispc", "AOS vector element sum (with coalescing)" },
|
||||
{ xyzSumAOS, "serial", ispc::xyzSumAOSStdlib, "ispc", "AOS vector element sum (stdlib swizzle)" },
|
||||
{ xyzSumAOS, "serial", ispc::xyzSumAOSNoCoalesce, "ispc", "AOS vector element sum (no coalescing)" },
|
||||
{ xyzSumSOA, "serial", ispc::xyzSumSOA, "ispc", "SOA vector element sum" },
|
||||
{ ispc::gathers, "gather", ispc::loads, "vector load", "Memory reads" },
|
||||
{ ispc::scatters, "scatter", ispc::stores, "vector store", "Memory writes" },
|
||||
};
|
||||
|
||||
int main() {
|
||||
int count = 3*64*1024;
|
||||
float *a = new float[count];
|
||||
float zeros[32] = { 0 };
|
||||
|
||||
int nTests = sizeof(tests) / sizeof(tests[0]);
|
||||
for (int i = 0; i < nTests; ++i) {
|
||||
lInitData(a, count);
|
||||
reset_and_start_timer();
|
||||
float resultA[3] = { 0, 0, 0 };
|
||||
for (int j = 0; j < 100; ++j)
|
||||
tests[i].aFunc(a, count, zeros, resultA);
|
||||
double aTime = get_elapsed_mcycles();
|
||||
|
||||
lInitData(a, count);
|
||||
reset_and_start_timer();
|
||||
float resultB[3] = { 0, 0, 0 };
|
||||
for (int j = 0; j < 100; ++j)
|
||||
tests[i].bFunc(a, count, zeros, resultB);
|
||||
double bTime = get_elapsed_mcycles();
|
||||
|
||||
printf("%-40s: [%.2f] M cycles %s, [%.2f] M cycles %s (%.2fx speedup).\n",
|
||||
tests[i].testName, aTime, tests[i].aName, bTime, tests[i].bName,
|
||||
aTime/bTime);
|
||||
#if 0
|
||||
printf("\t(%f %f %f) - (%f %f %f)\n", resultSerial[0], resultSerial[1],
|
||||
resultSerial[2], resultISPC[0], resultISPC[1], resultISPC[2]);
|
||||
#endif
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
170
examples/perfbench/perfbench.ispc
Normal file
170
examples/perfbench/perfbench.ispc
Normal file
@@ -0,0 +1,170 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
export void xyzSumAOS(uniform float array[], uniform int count,
|
||||
uniform float zeros[], uniform float result[]) {
|
||||
float xsum = 0, ysum = 0, zsum = 0;
|
||||
foreach (i = 0 ... count/3) {
|
||||
float x = array[3*i];
|
||||
float y = array[3*i+1];
|
||||
float z = array[3*i+2];
|
||||
|
||||
xsum += x;
|
||||
ysum += y;
|
||||
zsum += z;
|
||||
}
|
||||
result[0] = reduce_add(xsum);
|
||||
result[1] = reduce_add(ysum);
|
||||
result[2] = reduce_add(zsum);
|
||||
}
|
||||
|
||||
export void xyzSumAOSStdlib(uniform float array[], uniform int count,
|
||||
uniform float zeros[], uniform float result[]) {
|
||||
float xsum = 0, ysum = 0, zsum = 0;
|
||||
for (uniform int i = 0; i < 64*1024 /*count/3*/; i += programCount) {
|
||||
float x, y, z;
|
||||
aos_to_soa3(&array[3*i], &x, &y, &z);
|
||||
|
||||
xsum += x;
|
||||
ysum += y;
|
||||
zsum += z;
|
||||
}
|
||||
result[0] = reduce_add(xsum);
|
||||
result[1] = reduce_add(ysum);
|
||||
result[2] = reduce_add(zsum);
|
||||
}
|
||||
|
||||
export void xyzSumAOSNoCoalesce(uniform float array[], uniform int count,
|
||||
uniform float zerosArray[], uniform float result[]) {
|
||||
int zeros = zerosArray[programIndex];
|
||||
float xsum = 0, ysum = 0, zsum = 0;
|
||||
foreach (i = 0 ... count/3) {
|
||||
float x = array[3*i+zeros];
|
||||
float y = array[3*i+1+zeros];
|
||||
float z = array[3*i+2+zeros];
|
||||
|
||||
xsum += x;
|
||||
ysum += y;
|
||||
zsum += z;
|
||||
}
|
||||
result[0] = reduce_add(xsum);
|
||||
result[1] = reduce_add(ysum);
|
||||
result[2] = reduce_add(zsum);
|
||||
}
|
||||
|
||||
// Sum x/y/z over data stored in hybrid SOA-8 blocks: every 24 floats
// hold 8 x values, then 8 y values, then 8 z values.  count is the
// total number of floats; "zeros" is unused (signature parity with
// the other benchmark kernels).
export void xyzSumSOA(uniform float array[], uniform int count,
                      uniform float zeros[], uniform float result[]) {
    float sx = 0, sy = 0, sz = 0;
    uniform float * uniform base = array;
    // The fixed 8-wide block layout limits the SIMD widths this
    // kernel can handle.
    assert(programCount <= 8);

    // One 24-float block (8 vectors) per outer iteration.
    for (uniform int vec = 0; vec < count/3; vec += 8, base += 24) {
        // Cover the 8 lanes of the block, programCount at a time;
        // these are unit-stride vector loads rather than gathers.
        for (uniform int lane = 0; lane < 8; lane += programCount) {
            sx += base[lane + programIndex];
            sy += base[8 + lane + programIndex];
            sz += base[16 + lane + programIndex];
        }
    }
    result[0] = reduce_add(sx);
    result[1] = reduce_add(sy);
    result[2] = reduce_add(sz);
}
|
||||
|
||||
// Benchmark gather performance: the varying offset loaded from zeros[]
// (always 0 at runtime, but opaque to the compiler) forces every load
// to be a gather rather than a coalesced vector load.
export void gathers(uniform float array[], uniform int count,
                    uniform float zeros[], uniform float result[]) {
    int offset = zeros[programIndex];
    float total = 0;
    foreach (i = 0 ... count)
        total += array[i + offset];
    result[0] = reduce_add(total);
}
|
||||
|
||||
|
||||
// Benchmark coalesced vector-load performance: the index is the plain
// foreach counter, so each iteration is a unit-stride load (compare
// with gathers() above, where an opaque offset forces gathers).
export void loads(uniform float array[], uniform int count,
                  uniform float zeros[], uniform float result[]) {
    float total = 0;
    foreach (i = 0 ... count)
        total += array[i];
    result[0] = reduce_add(total);
}
|
||||
|
||||
|
||||
// Benchmark scatter performance: the varying offset loaded from
// zeros[] (always 0 at runtime, but opaque to the compiler) forces a
// scatter for every store.  The stored value is the offset itself,
// i.e. 0; result[] is unused.
export void scatters(uniform float array[], uniform int count,
                     uniform float zeros[], uniform float result[]) {
    int offset = zeros[programIndex];
    foreach (i = 0 ... count)
        array[i + offset] = offset;
}
|
||||
|
||||
|
||||
// Benchmark coalesced vector-store performance: the index is the plain
// foreach counter, so stores are unit-stride (compare with scatters()
// above).  The value written is loaded from zeros[], i.e. 0; result[]
// is unused.
export void stores(uniform float array[], uniform int count,
                   uniform float zeros[], uniform float result[]) {
    int value = zeros[programIndex];
    foreach (i = 0 ... count)
        array[i] = value;
}
|
||||
|
||||
// "Normalize" AOS (x,y,z) triples in place, with every load index
// perturbed by a value read from zeroArray (all zeros at runtime, but
// opaque to the compiler) so the loads become gathers.  The stores use
// the unperturbed indices and remain coalescible.
// NOTE(review): this divides by the *squared* length (x*x+y*y+z*z),
// not sqrt of it -- presumably acceptable for a memory-access
// benchmark, but confirm if true normalization is intended.
export void normalizeAOSNoCoalesce(uniform float array[], uniform int count,
                                   uniform float zeroArray[]) {
    float off = zeroArray[programIndex];
    foreach (i = 0 ... count/3) {
        float px = array[3*i + off];
        float py = array[3*i + 1 + off];
        float pz = array[3*i + 2 + off];

        float lenSq = px*px + py*py + pz*pz;

        array[3*i]     /= lenSq;
        array[3*i + 1] /= lenSq;
        array[3*i + 2] /= lenSq;
    }
}
|
||||
|
||||
// In-place "normalization" of (x,y,z) triples.
// NOTE(review): despite the name, the indexing here is plain AOS
// (x0,y0,z0,...) -- identical to normalizeAOSNoCoalesce minus the
// offset trick -- and the division is by the squared length, not the
// length; confirm both are intentional for the benchmark.
export void normalizeSOA(uniform float array[], uniform int count,
                         uniform float zeros[]) {
    foreach (i = 0 ... count/3) {
        float px = array[3*i];
        float py = array[3*i + 1];
        float pz = array[3*i + 2];

        float lenSq = px*px + py*py + pz*pz;

        array[3*i]     /= lenSq;
        array[3*i + 1] /= lenSq;
        array[3*i + 2] /= lenSq;
    }
}
|
||||
175
examples/perfbench/perfbench.vcxproj
Normal file
175
examples/perfbench/perfbench.vcxproj
Normal file
@@ -0,0 +1,175 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{d923bb7e-a7c8-4850-8fcf-0eb9ce35b4e8}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>perfbench</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="perfbench.cpp" />
|
||||
<ClCompile Include="perfbench_serial.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="perfbench.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
61
examples/perfbench/perfbench_serial.cpp
Normal file
61
examples/perfbench/perfbench_serial.cpp
Normal file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <math.h>
|
||||
|
||||
/* Scalar reference: sum the x, y, and z components of data laid out
 * as (x0,y0,z0,x1,y1,z1,...).
 *
 * a      - input array of count floats
 * count  - total number of floats (expected to be a multiple of 3)
 * zeros  - unused; kept for signature parity with the ispc kernels
 * result - receives the three component sums in result[0..2]
 *
 * The loop guards the whole triple (i + 2 < count), so a count that
 * is not a multiple of 3 can no longer read a[i+1]/a[i+2] past the
 * end of the array (the previous "i < count" test did).
 */
void
xyzSumAOS(float *a, int count, float *zeros, float *result) {
    float xsum = 0.f, ysum = 0.f, zsum = 0.f;
    for (int i = 0; i + 2 < count; i += 3) {
        xsum += a[i];
        ysum += a[i + 1];
        zsum += a[i + 2];
    }
    result[0] = xsum;
    result[1] = ysum;
    result[2] = zsum;
}
|
||||
|
||||
/* Scalar reference for the hybrid SOA-8 layout: data is stored in
 * blocks of 24 floats -- 8 x values, then 8 y values, then 8 z
 * values.  count is the total number of floats (count/3 vectors);
 * assumes the data is padded out to whole 24-float blocks.  zeros is
 * unused (signature parity with the ispc kernels); result[0..2]
 * receives the x/y/z sums.
 */
void
xyzSumSOA(float *a, int count, float *zeros, float *result) {
    float sum[3] = { 0.f, 0.f, 0.f };
    int nvec = count / 3;
    for (int i = 0; i < nvec; ++i) {
        /* Block number i/8, lane i%8 within the block. */
        float *base = a + (i / 8) * 24 + (i % 8);
        sum[0] += base[0];   /* x half of the block */
        sum[1] += base[8];   /* y half */
        sum[2] += base[16];  /* z half */
    }
    result[0] = sum[0];
    result[1] = sum[1];
    result[2] = sum[2];
}
|
||||
@@ -43,17 +43,17 @@ struct Ray {
|
||||
};
|
||||
|
||||
struct Triangle {
|
||||
uniform float p[3][4];
|
||||
uniform int id;
|
||||
uniform int pad[3];
|
||||
float p[3][4];
|
||||
int id;
|
||||
int pad[3];
|
||||
};
|
||||
|
||||
struct LinearBVHNode {
|
||||
uniform float bounds[2][3];
|
||||
uniform unsigned int offset; // num primitives for leaf, second child for interior
|
||||
uniform unsigned int8 nPrimitives;
|
||||
uniform unsigned int8 splitAxis;
|
||||
uniform unsigned int16 pad;
|
||||
float bounds[2][3];
|
||||
unsigned int offset; // num primitives for leaf, second child for interior
|
||||
unsigned int8 nPrimitives;
|
||||
unsigned int8 splitAxis;
|
||||
unsigned int16 pad;
|
||||
};
|
||||
|
||||
static inline float3 Cross(const float3 v1, const float3 v2) {
|
||||
@@ -88,9 +88,12 @@ static void generateRay(uniform const float raster2camera[4][4],
|
||||
camy /= camw;
|
||||
camz /= camw;
|
||||
|
||||
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
|
||||
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
|
||||
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
|
||||
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
|
||||
camera2world[0][2] * camz;
|
||||
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
|
||||
camera2world[1][2] * camz;
|
||||
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
|
||||
camera2world[2][2] * camz;
|
||||
|
||||
ray.origin.x = camera2world[0][3] / camera2world[3][3];
|
||||
ray.origin.y = camera2world[1][3] / camera2world[3][3];
|
||||
@@ -143,7 +146,7 @@ static bool BBoxIntersect(const uniform float bounds[2][3],
|
||||
|
||||
|
||||
|
||||
static bool TriIntersect(const Triangle &tri, Ray &ray) {
|
||||
static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
|
||||
uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
|
||||
uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
|
||||
uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
|
||||
@@ -183,8 +186,8 @@ static bool TriIntersect(const Triangle &tri, Ray &ray) {
|
||||
}
|
||||
|
||||
|
||||
bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||
Ray &r) {
|
||||
bool BVHIntersect(const uniform LinearBVHNode nodes[],
|
||||
const uniform Triangle tris[], Ray &r) {
|
||||
Ray ray = r;
|
||||
bool hit = false;
|
||||
// Follow ray through BVH nodes to find primitive intersections
|
||||
@@ -193,7 +196,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||
|
||||
while (true) {
|
||||
// Check ray against BVH node
|
||||
LinearBVHNode node = nodes[nodeNum];
|
||||
uniform LinearBVHNode node = nodes[nodeNum];
|
||||
if (any(BBoxIntersect(node.bounds, ray))) {
|
||||
uniform unsigned int nPrimitives = node.nPrimitives;
|
||||
if (nPrimitives > 0) {
|
||||
@@ -239,8 +242,8 @@ static void raytrace_tile(uniform int x0, uniform int x1,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
const uniform LinearBVHNode nodes[],
|
||||
const uniform Triangle triangles[]) {
|
||||
uniform float widthScale = (float)(baseWidth) / (float)(width);
|
||||
uniform float heightScale = (float)(baseHeight) / (float)(height);
|
||||
|
||||
@@ -262,8 +265,8 @@ export void raytrace_ispc(uniform int width, uniform int height,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
const uniform LinearBVHNode nodes[],
|
||||
const uniform Triangle triangles[]) {
|
||||
raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world, image,
|
||||
id, nodes, triangles);
|
||||
@@ -275,8 +278,8 @@ task void raytrace_tile_task(uniform int width, uniform int height,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
const uniform LinearBVHNode nodes[],
|
||||
const uniform Triangle triangles[]) {
|
||||
uniform int dx = 16, dy = 16; // must match dx, dy below
|
||||
uniform int xBuckets = (width + (dx-1)) / dx;
|
||||
uniform int x0 = (taskIndex % xBuckets) * dx;
|
||||
@@ -295,14 +298,14 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
const uniform LinearBVHNode nodes[],
|
||||
const uniform Triangle triangles[]) {
|
||||
uniform int dx = 16, dy = 16;
|
||||
uniform int xBuckets = (width + (dx-1)) / dx;
|
||||
uniform int yBuckets = (height + (dy-1)) / dy;
|
||||
uniform int nTasks = xBuckets * yBuckets;
|
||||
launch[nTasks] < raytrace_tile_task(width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world,
|
||||
image, id, nodes, triangles) >;
|
||||
launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world,
|
||||
image, id, nodes, triangles);
|
||||
}
|
||||
|
||||
|
||||
@@ -123,9 +123,12 @@ static void generateRay(const float raster2camera[4][4],
|
||||
camy /= camw;
|
||||
camz /= camw;
|
||||
|
||||
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
|
||||
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
|
||||
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
|
||||
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
|
||||
camera2world[0][2] * camz;
|
||||
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
|
||||
camera2world[1][2] * camz;
|
||||
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
|
||||
camera2world[2][2] * camz;
|
||||
|
||||
ray.origin.x = camera2world[0][3] / camera2world[3][3];
|
||||
ray.origin.y = camera2world[1][3] / camera2world[3][3];
|
||||
|
||||
@@ -88,11 +88,11 @@ loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
|
||||
// Parallelize across cores as well: each task will work on a slice
|
||||
// of 1 in the z extent of the volume.
|
||||
if ((t & 1) == 0)
|
||||
launch[z1-z0] < stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
|
||||
coef, vsq, Aeven, Aodd) >;
|
||||
launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
|
||||
coef, vsq, Aeven, Aodd);
|
||||
else
|
||||
launch[z1-z0] < stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
|
||||
coef, vsq, Aodd, Aeven) >;
|
||||
launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
|
||||
coef, vsq, Aodd, Aeven);
|
||||
|
||||
// We need to wait for all of the launched tasks to finish before
|
||||
// starting the next iteration.
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -40,21 +40,68 @@
|
||||
Runtime Requirements" for information about the task-related entrypoints
|
||||
that are implemented here.
|
||||
|
||||
There are three task systems in this file: one built using Microsoft's
|
||||
Concurrency Runtime, one built with Apple's Grand Central Dispatch, and
|
||||
one built on top of bare pthreads.
|
||||
There are several task systems in this file, built using:
|
||||
- Microsoft's Concurrency Runtime (ISPC_USE_CONCRT)
|
||||
- Apple's Grand Central Dispatch (ISPC_USE_GCD)
|
||||
- bare pthreads (ISPC_USE_PTHREADS, ISPC_USE_PTHREADS_FULLY_SUBSCRIBED)
|
||||
- Cilk Plus (ISPC_USE_CILK)
|
||||
- TBB (ISPC_USE_TBB_TASK_GROUP, ISPC_USE_TBB_PARALLEL_FOR)
|
||||
- OpenMP (ISPC_USE_OMP)
|
||||
|
||||
The task system implementation can be selected at compile time, by defining
|
||||
the appropriate preprocessor symbol on the command line (for e.g.: -D ISPC_USE_TBB).
|
||||
Not all combinations of platform and task system are meaningful.
|
||||
If no task system is requested, a reasonable default task system for the platform
|
||||
is selected. Here are the task systems that can be selected:
|
||||
|
||||
#define ISPC_USE_GCD
|
||||
#define ISPC_USE_CONCRT
|
||||
#define ISPC_USE_PTHREADS
|
||||
#define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
|
||||
#define ISPC_USE_CILK
|
||||
#define ISPC_USE_OMP
|
||||
#define ISPC_USE_TBB_TASK_GROUP
|
||||
#define ISPC_USE_TBB_PARALLEL_FOR
|
||||
|
||||
The ISPC_USE_PTHREADS_FULLY_SUBSCRIBED model essentially takes over the machine
|
||||
by assigning one pthread to each hyper-thread, and then uses spinlocks and atomics
|
||||
for task management. This model is useful for KNC where tasks can take over
|
||||
the machine, but less so when there are other tasks that need running on the machine.
|
||||
|
||||
#define ISPC_USE_CREW
|
||||
|
||||
*/
|
||||
|
||||
#if !(defined ISPC_USE_CONCRT || defined ISPC_USE_GCD || \
|
||||
defined ISPC_USE_PTHREADS || defined ISPC_USE_PTHREADS_FULLY_SUBSCRIBED || \
|
||||
defined ISPC_USE_TBB_TASK_GROUP || defined ISPC_USE_TBB_PARALLEL_FOR || \
|
||||
defined ISPC_USE_OMP || defined ISPC_USE_CILK )
|
||||
|
||||
// If no task model chosen from the compiler cmdline, pick a reasonable default
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ISPC_USE_CONCRT
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_USE_PTHREADS
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_USE_GCD
|
||||
#endif
|
||||
#if defined(__KNC__)
|
||||
#define ISPC_USE_PTHREADS
|
||||
#endif
|
||||
|
||||
#endif // No task model specified on compiler cmdline
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ISPC_IS_WINDOWS
|
||||
#define ISPC_USE_CONCRT
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#define ISPC_USE_PTHREADS
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#define ISPC_USE_GCD
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
#if defined(__KNC__)
|
||||
#define ISPC_IS_KNC
|
||||
#endif
|
||||
|
||||
|
||||
#define DBG(x)
|
||||
|
||||
@@ -83,9 +130,37 @@
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#endif // ISPC_USE_PTHREADS
|
||||
#ifdef ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
|
||||
#include <pthread.h>
|
||||
#include <semaphore.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/sysctl.h>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
//#include <stdexcept>
|
||||
#include <stack>
|
||||
#endif // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
|
||||
#ifdef ISPC_USE_TBB_PARALLEL_FOR
|
||||
#include <tbb/parallel_for.h>
|
||||
#endif // ISPC_USE_TBB_PARALLEL_FOR
|
||||
#ifdef ISPC_USE_TBB_TASK_GROUP
|
||||
#include <tbb/task_group.h>
|
||||
#endif // ISPC_USE_TBB_TASK_GROUP
|
||||
#ifdef ISPC_USE_CILK
|
||||
#include <cilk/cilk.h>
|
||||
#endif // ISPC_USE_TBB
|
||||
#ifdef ISPC_USE_OMP
|
||||
#include <omp.h>
|
||||
#endif // ISPC_USE_OMP
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif // ISPC_IS_LINUX
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
@@ -107,6 +182,13 @@ struct TaskInfo {
|
||||
#endif
|
||||
};
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
|
||||
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
|
||||
void ISPCSync(void *handle);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// TaskGroupBase
|
||||
|
||||
@@ -181,7 +263,7 @@ inline TaskGroupBase::~TaskGroupBase() {
|
||||
// Note: don't delete memBuffers[0], since it points to the start of
|
||||
// the "mem" member!
|
||||
for (int i = 1; i < NUM_MEM_BUFFERS; ++i)
|
||||
delete[] memBuffers[i];
|
||||
delete[](memBuffers[i]);
|
||||
}
|
||||
|
||||
|
||||
@@ -224,10 +306,10 @@ TaskGroupBase::GetTaskInfo(int index) {
|
||||
inline void *
|
||||
TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
|
||||
char *basePtr = memBuffers[curMemBuffer];
|
||||
int64_t iptr = (int64_t)(basePtr + curMemBufferOffset);
|
||||
intptr_t iptr = (intptr_t)(basePtr + curMemBufferOffset);
|
||||
iptr = (iptr + (alignment-1)) & ~(alignment-1);
|
||||
|
||||
int newOffset = int(iptr + size - (int64_t)basePtr);
|
||||
int newOffset = int(iptr - (intptr_t)basePtr + size);
|
||||
if (newOffset < memBufferSize[curMemBuffer]) {
|
||||
curMemBufferOffset = newOffset;
|
||||
return (char *)iptr;
|
||||
@@ -249,14 +331,6 @@ TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Atomics and the like
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
static inline void
|
||||
lMemFence() {
|
||||
__asm__ __volatile__("mfence":::"memory");
|
||||
}
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
|
||||
|
||||
#if (__SIZEOF_POINTER__ == 4) || defined(__i386__) || defined(_WIN32)
|
||||
#define ISPC_POINTER_BYTES 4
|
||||
#elif (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64)
|
||||
@@ -266,6 +340,15 @@ lMemFence() {
|
||||
#endif // __SIZEOF_POINTER__
|
||||
|
||||
|
||||
static inline void
|
||||
lMemFence() {
|
||||
// Windows atomic functions already contain the fence
|
||||
// KNC doesn't need the memory barrier
|
||||
#if !defined ISPC_IS_KNC || !defined ISPC_IS_WINDOWS
|
||||
__asm__ __volatile__("mfence":::"memory");
|
||||
#endif
|
||||
}
|
||||
|
||||
static void *
|
||||
lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
@@ -288,11 +371,11 @@ lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
}
|
||||
|
||||
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
static int32_t
|
||||
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return InterlockedCompareExchange(v, newValue, oldValue);
|
||||
#else
|
||||
int32_t result;
|
||||
__asm__ __volatile__("lock\ncmpxchgl %2,%1"
|
||||
: "=a"(result), "=m"(*v)
|
||||
@@ -300,9 +383,22 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue)
|
||||
: "memory");
|
||||
lMemFence();
|
||||
return result;
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
}
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
|
||||
static inline int32_t
|
||||
lAtomicAdd(volatile int32_t *v, int32_t delta) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return InterlockedAdd(v, delta);
|
||||
#else
|
||||
int32_t origValue;
|
||||
__asm__ __volatile__("lock\n"
|
||||
"xaddl %0,%1"
|
||||
: "=r"(origValue), "=m"(*v) : "0"(delta)
|
||||
: "memory");
|
||||
return origValue;
|
||||
#endif
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -366,6 +462,50 @@ private:
|
||||
|
||||
#endif // ISPC_USE_PTHREADS
|
||||
|
||||
#ifdef ISPC_USE_CILK
|
||||
|
||||
class TaskGroup : public TaskGroupBase {
|
||||
public:
|
||||
void Launch(int baseIndex, int count);
|
||||
void Sync();
|
||||
|
||||
};
|
||||
|
||||
#endif // ISPC_USE_CILK
|
||||
|
||||
#ifdef ISPC_USE_OMP
|
||||
|
||||
class TaskGroup : public TaskGroupBase {
|
||||
public:
|
||||
void Launch(int baseIndex, int count);
|
||||
void Sync();
|
||||
|
||||
};
|
||||
|
||||
#endif // ISPC_USE_OMP
|
||||
|
||||
#ifdef ISPC_USE_TBB_PARALLEL_FOR
|
||||
|
||||
class TaskGroup : public TaskGroupBase {
|
||||
public:
|
||||
void Launch(int baseIndex, int count);
|
||||
void Sync();
|
||||
|
||||
};
|
||||
|
||||
#endif // ISPC_USE_TBB_PARALLEL_FOR
|
||||
|
||||
#ifdef ISPC_USE_TBB_TASK_GROUP
|
||||
|
||||
class TaskGroup : public TaskGroupBase {
|
||||
public:
|
||||
void Launch(int baseIndex, int count);
|
||||
void Sync();
|
||||
private:
|
||||
tbb::task_group tbbTaskGroup;
|
||||
};
|
||||
|
||||
#endif // ISPC_USE_TBB_TASK_GROUP
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Grand Central Dispatch
|
||||
@@ -487,18 +627,6 @@ static pthread_mutex_t taskSysMutex;
|
||||
static std::vector<TaskGroup *> activeTaskGroups;
|
||||
static sem_t *workerSemaphore;
|
||||
|
||||
|
||||
static inline int32_t
|
||||
lAtomicAdd(int32_t *v, int32_t delta) {
|
||||
int32_t origValue;
|
||||
__asm__ __volatile__("lock\n"
|
||||
"xaddl %0,%1"
|
||||
: "=r"(origValue), "=m"(*v) : "0"(delta)
|
||||
: "memory");
|
||||
return origValue;
|
||||
}
|
||||
|
||||
|
||||
static void *
|
||||
lTaskEntry(void *arg) {
|
||||
int threadIndex = (int)((int64_t)arg);
|
||||
@@ -724,11 +852,15 @@ TaskGroup::Sync() {
|
||||
exit(1);
|
||||
}
|
||||
// FIXME: We basically end up busy-waiting here, which is
|
||||
// extra wasteful in a world with hyperthreading. It would
|
||||
// extra wasteful in a world with hyper-threading. It would
|
||||
// be much better to put this thread to sleep on a
|
||||
// condition variable that was signaled when the last task
|
||||
// in this group was finished.
|
||||
sleep(0);
|
||||
#ifndef ISPC_IS_KNC
|
||||
usleep(1);
|
||||
#else
|
||||
_mm_delay_32(8);
|
||||
#endif
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -772,6 +904,124 @@ TaskGroup::Sync() {
|
||||
#endif // ISPC_USE_PTHREADS
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Cilk Plus
|
||||
|
||||
#ifdef ISPC_USE_CILK
|
||||
|
||||
static void
|
||||
InitTaskSystem() {
|
||||
// No initialization needed
|
||||
}
|
||||
|
||||
inline void
|
||||
TaskGroup::Launch(int baseIndex, int count) {
|
||||
cilk_for(int i = 0; i < count; i++) {
|
||||
TaskInfo *ti = GetTaskInfo(baseIndex + i);
|
||||
|
||||
// Actually run the task.
|
||||
// Cilk does not expose the task -> thread mapping so we pretend it's 1:1
|
||||
ti->func(ti->data, ti->taskIndex, ti->taskCount, ti->taskIndex, ti->taskCount);
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
TaskGroup::Sync() {
|
||||
}
|
||||
|
||||
#endif // ISPC_USE_CILK
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// OpenMP
|
||||
|
||||
#ifdef ISPC_USE_OMP
|
||||
|
||||
static void
|
||||
InitTaskSystem() {
|
||||
// No initialization needed
|
||||
}
|
||||
|
||||
inline void
|
||||
TaskGroup::Launch(int baseIndex, int count) {
|
||||
#pragma omp parallel for
|
||||
for(int i = 0; i < count; i++) {
|
||||
TaskInfo *ti = GetTaskInfo(baseIndex + i);
|
||||
|
||||
// Actually run the task.
|
||||
int threadIndex = omp_get_thread_num();
|
||||
int threadCount = omp_get_num_threads();
|
||||
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
TaskGroup::Sync() {
|
||||
}
|
||||
|
||||
#endif // ISPC_USE_OMP
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Thread Building Blocks
|
||||
|
||||
#ifdef ISPC_USE_TBB_PARALLEL_FOR
|
||||
|
||||
static void
|
||||
InitTaskSystem() {
|
||||
// No initialization needed by default
|
||||
//tbb::task_scheduler_init();
|
||||
}
|
||||
|
||||
inline void
|
||||
TaskGroup::Launch(int baseIndex, int count) {
|
||||
tbb::parallel_for(0, count, [=](int i) {
|
||||
TaskInfo *ti = GetTaskInfo(baseIndex + i);
|
||||
|
||||
// Actually run the task.
|
||||
// TBB does not expose the task -> thread mapping so we pretend it's 1:1
|
||||
int threadIndex = ti->taskIndex;
|
||||
int threadCount = ti->taskCount;
|
||||
|
||||
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
|
||||
});
|
||||
}
|
||||
|
||||
inline void
|
||||
TaskGroup::Sync() {
|
||||
}
|
||||
|
||||
#endif // ISPC_USE_TBB_PARALLEL_FOR
|
||||
|
||||
#ifdef ISPC_USE_TBB_TASK_GROUP
|
||||
|
||||
static void
|
||||
InitTaskSystem() {
|
||||
// No initialization needed by default
|
||||
//tbb::task_scheduler_init();
|
||||
}
|
||||
|
||||
inline void
|
||||
TaskGroup::Launch(int baseIndex, int count) {
|
||||
for (int i = 0; i < count; i++) {
|
||||
tbbTaskGroup.run([=]() {
|
||||
TaskInfo *ti = GetTaskInfo(baseIndex + i);
|
||||
|
||||
// TBB does not expose the task -> thread mapping so we pretend it's 1:1
|
||||
int threadIndex = ti->taskIndex;
|
||||
int threadCount = ti->taskCount;
|
||||
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
TaskGroup::Sync() {
|
||||
tbbTaskGroup.wait();
|
||||
}
|
||||
|
||||
#endif // ISPC_USE_TBB_TASK_GROUP
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
|
||||
|
||||
#define MAX_FREE_TASK_GROUPS 64
|
||||
static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
|
||||
@@ -783,7 +1033,6 @@ AllocTaskGroup() {
|
||||
if (tg != NULL) {
|
||||
void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);
|
||||
if (ptr != NULL) {
|
||||
assert(ptr == tg);
|
||||
return (TaskGroup *)ptr;
|
||||
}
|
||||
}
|
||||
@@ -810,13 +1059,6 @@ FreeTaskGroup(TaskGroup *tg) {
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
|
||||
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
|
||||
void ISPCSync(void *handle);
|
||||
}
|
||||
|
||||
void
|
||||
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
|
||||
TaskGroup *taskGroup;
|
||||
@@ -863,3 +1105,250 @@ ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) {
|
||||
|
||||
return taskGroup->AllocMemory(size, alignment);
|
||||
}
|
||||
|
||||
#else // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
|
||||
|
||||
#define MAX_LIVE_TASKS 1024
|
||||
|
||||
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
// Small structure used to hold the data for each task
|
||||
struct Task {
|
||||
public:
|
||||
TaskFuncType func;
|
||||
void *data;
|
||||
volatile int32_t taskIndex;
|
||||
int taskCount;
|
||||
|
||||
volatile int numDone;
|
||||
int liveIndex; // index in live task queue
|
||||
|
||||
inline int noMoreWork() { return taskIndex >= taskCount; }
|
||||
/*! given thread is done working on this task --> decrease num locks */
|
||||
// inline void lock() { lAtomicAdd(&locks,1); }
|
||||
// inline void unlock() { lAtomicAdd(&locks,-1); }
|
||||
inline int nextJob() { return lAtomicAdd(&taskIndex,1); }
|
||||
inline int numJobs() { return taskCount; }
|
||||
inline void schedule(int idx) { taskIndex = 0; numDone = 0; liveIndex = idx; }
|
||||
inline void run(int idx, int threadIdx);
|
||||
inline void markOneDone() { lAtomicAdd(&numDone,1); }
|
||||
inline void wait()
|
||||
{
|
||||
while (!noMoreWork()) {
|
||||
int next = nextJob();
|
||||
if (next < numJobs()) run(next, 0);
|
||||
}
|
||||
while (numDone != taskCount) {
|
||||
#ifndef ISPC_IS_KNC
|
||||
usleep(1);
|
||||
#else
|
||||
_mm_delay_32(8);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
class TaskSys {
|
||||
static int numThreadsRunning;
|
||||
struct LiveTask
|
||||
{
|
||||
volatile int locks; /*!< num locks on this task. gets
|
||||
initialized to NUM_THREADS+1, then counted
|
||||
down by every thread that sees this. this
|
||||
value is only valid when 'active' is set
|
||||
to true */
|
||||
volatile int active; /*! workers will spin on this until it
|
||||
becomes active */
|
||||
Task *task;
|
||||
|
||||
inline void doneWithThis() { lAtomicAdd(&locks,-1); }
|
||||
LiveTask() : active(0), locks(-1) {}
|
||||
};
|
||||
|
||||
public:
|
||||
volatile int nextScheduleIndex; /*! next index in the task queue
|
||||
where we'll insert a live task */
|
||||
|
||||
// inline int inc_begin() { int old = begin; begin = (begin+1)%MAX_TASKS; return old; }
|
||||
// inline int inc_end() { int old = end; end = (end+1)%MAX_TASKS; return old; }
|
||||
|
||||
LiveTask taskQueue[MAX_LIVE_TASKS];
|
||||
std::stack<Task *> taskMem;
|
||||
|
||||
static TaskSys *global;
|
||||
|
||||
TaskSys() : nextScheduleIndex(0)
|
||||
{
|
||||
TaskSys::global = this;
|
||||
Task *mem = new Task[MAX_LIVE_TASKS]; //< could actually be more than _live_ tasks
|
||||
for (int i=0;i<MAX_LIVE_TASKS;i++) {
|
||||
taskMem.push(mem+i);
|
||||
}
|
||||
createThreads();
|
||||
}
|
||||
|
||||
inline Task *allocOne()
|
||||
{
|
||||
pthread_mutex_lock(&mutex);
|
||||
if (taskMem.empty()) {
|
||||
fprintf(stderr, "Too many live tasks. "
|
||||
"Change the value of MAX_LIVE_TASKS and recompile.\n");
|
||||
exit(1);
|
||||
}
|
||||
Task *task = taskMem.top();
|
||||
taskMem.pop();
|
||||
pthread_mutex_unlock(&mutex);
|
||||
return task;
|
||||
}
|
||||
|
||||
static inline void init()
|
||||
{
|
||||
if (global) return;
|
||||
pthread_mutex_lock(&mutex);
|
||||
if (global == NULL) global = new TaskSys;
|
||||
pthread_mutex_unlock(&mutex);
|
||||
}
|
||||
|
||||
void createThreads();
|
||||
int nThreads;
|
||||
pthread_t *thread;
|
||||
|
||||
void threadFct();
|
||||
|
||||
inline void schedule(Task *t)
|
||||
{
|
||||
pthread_mutex_lock(&mutex);
|
||||
int liveIndex = nextScheduleIndex;
|
||||
nextScheduleIndex = (nextScheduleIndex+1)%MAX_LIVE_TASKS;
|
||||
if (taskQueue[liveIndex].active) {
|
||||
fprintf(stderr, "Out of task queue resources. "
|
||||
"Change the value of MAX_LIVE_TASKS and recompile.\n");
|
||||
exit(1);
|
||||
}
|
||||
taskQueue[liveIndex].task = t;
|
||||
t->schedule(liveIndex);
|
||||
taskQueue[liveIndex].locks = numThreadsRunning+1; // num _worker_ threads plus creator
|
||||
taskQueue[liveIndex].active = true;
|
||||
pthread_mutex_unlock(&mutex);
|
||||
}
|
||||
|
||||
void sync(Task *task)
|
||||
{
|
||||
task->wait();
|
||||
int liveIndex = task->liveIndex;
|
||||
while (taskQueue[liveIndex].locks > 1) {
|
||||
#ifndef ISPC_IS_KNC
|
||||
usleep(1);
|
||||
#else
|
||||
_mm_delay_32(8);
|
||||
#endif
|
||||
}
|
||||
_mm_free(task->data);
|
||||
pthread_mutex_lock(&mutex);
|
||||
taskMem.push(task); // recycle task index
|
||||
taskQueue[liveIndex].active = false;
|
||||
pthread_mutex_unlock(&mutex);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
void TaskSys::threadFct()
|
||||
{
|
||||
int myIndex = 0; //lAtomicAdd(&threadIdx,1);
|
||||
while (1) {
|
||||
while (!taskQueue[myIndex].active) {
|
||||
#ifndef ISPC_IS_KNC
|
||||
usleep(4);
|
||||
#else
|
||||
_mm_delay_32(32);
|
||||
#endif
|
||||
continue;
|
||||
}
|
||||
|
||||
Task *mine = taskQueue[myIndex].task;
|
||||
while (!mine->noMoreWork()) {
|
||||
int job = mine->nextJob();
|
||||
if (job >= mine->numJobs()) break;
|
||||
mine->run(job,myIndex);
|
||||
}
|
||||
taskQueue[myIndex].doneWithThis();
|
||||
myIndex = (myIndex+1)%MAX_LIVE_TASKS;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void Task::run(int idx, int threadIdx) {
|
||||
(*this->func)(data,threadIdx,TaskSys::global->nThreads,idx,taskCount);
|
||||
markOneDone();
|
||||
}
|
||||
|
||||
|
||||
void *_threadFct(void *data) {
|
||||
((TaskSys*)data)->threadFct();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
void TaskSys::createThreads()
|
||||
{
|
||||
init();
|
||||
int reserved = 4;
|
||||
int minid = 2;
|
||||
nThreads = sysconf(_SC_NPROCESSORS_ONLN) - reserved;
|
||||
|
||||
thread = (pthread_t *)malloc(nThreads * sizeof(pthread_t));
|
||||
|
||||
numThreadsRunning = 0;
|
||||
for (int i = 0; i < nThreads; ++i) {
|
||||
pthread_attr_t attr;
|
||||
pthread_attr_init(&attr);
|
||||
pthread_attr_setstacksize(&attr, 2*1024 * 1024);
|
||||
|
||||
int threadID = minid+i;
|
||||
cpu_set_t cpuset;
|
||||
CPU_ZERO(&cpuset);
|
||||
CPU_SET(threadID,&cpuset);
|
||||
int ret = pthread_attr_setaffinity_np(&attr,sizeof(cpuset),&cpuset);
|
||||
|
||||
int err = pthread_create(&thread[i], &attr, &_threadFct, this);
|
||||
++numThreadsRunning;
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TaskSys * TaskSys::global = NULL;
|
||||
int TaskSys::numThreadsRunning = 0;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count)
|
||||
{
|
||||
Task *ti = *(Task**)taskGroupPtr;
|
||||
ti->func = (TaskFuncType)func;
|
||||
ti->data = data;
|
||||
ti->taskIndex = 0;
|
||||
ti->taskCount = count;
|
||||
TaskSys::global->schedule(ti);
|
||||
}
|
||||
|
||||
void ISPCSync(void *h)
|
||||
{
|
||||
Task *task = (Task *)h;
|
||||
assert(task);
|
||||
TaskSys::global->sync(task);
|
||||
}
|
||||
|
||||
void *ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment)
|
||||
{
|
||||
TaskSys::init();
|
||||
Task *task = TaskSys::global->allocOne();
|
||||
*taskGroupPtr = task;
|
||||
task->data = _mm_malloc(size,alignment);
|
||||
return task->data;//*taskGroupPtr;
|
||||
}
|
||||
|
||||
#endif // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
|
||||
|
||||
@@ -43,9 +43,15 @@ extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
__inline__ uint64_t rdtsc() {
|
||||
uint32_t low, high;
|
||||
#ifdef __x86_64
|
||||
__asm__ __volatile__ (
|
||||
"xorl %%eax,%%eax \n cpuid"
|
||||
::: "%rax", "%rbx", "%rcx", "%rdx" );
|
||||
#else
|
||||
__asm__ __volatile__ (
|
||||
"xorl %%eax,%%eax \n cpuid"
|
||||
::: "%eax", "%ebx", "%ecx", "%edx" );
|
||||
#endif
|
||||
__asm__ __volatile__ (
|
||||
"rdtsc" : "=a" (low), "=d" (high));
|
||||
return (uint64_t)high << 32 | low;
|
||||
|
||||
@@ -336,6 +336,6 @@ volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
|
||||
// Launch tasks to work on (dx,dy)-sized tiles of the image
|
||||
uniform int dx = 8, dy = 8;
|
||||
uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
|
||||
launch[nTasks] < volume_task(density, nVoxels, raster2camera, camera2world,
|
||||
width, height, image) >;
|
||||
launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
|
||||
width, height, image);
|
||||
}
|
||||
|
||||
71
expr.h
71
expr.h
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -284,6 +284,10 @@ public:
|
||||
int EstimateCost() const;
|
||||
|
||||
Expr *baseExpr, *index;
|
||||
|
||||
private:
|
||||
mutable const Type *type;
|
||||
mutable const PointerType *lvalueType;
|
||||
};
|
||||
|
||||
|
||||
@@ -299,7 +303,6 @@ public:
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
const Type *GetLValueType() const;
|
||||
Symbol *GetBaseSymbol() const;
|
||||
void Print() const;
|
||||
Expr *Optimize();
|
||||
@@ -321,6 +324,9 @@ public:
|
||||
member is found. (i.e. this is true if the MemberExpr was a '->'
|
||||
operator, and is false if it was a '.' operator. */
|
||||
bool dereferenceExpr;
|
||||
|
||||
protected:
|
||||
mutable const Type *type, *lvalueType;
|
||||
};
|
||||
|
||||
|
||||
@@ -531,26 +537,48 @@ public:
|
||||
};
|
||||
|
||||
|
||||
/** @brief Expression that represents dereferencing a reference to get its
|
||||
value. */
|
||||
class DereferenceExpr : public Expr {
|
||||
/** @brief Common base class that provides shared functionality for
|
||||
PtrDerefExpr and RefDerefExpr. */
|
||||
class DerefExpr : public Expr {
|
||||
public:
|
||||
DereferenceExpr(Expr *e, SourcePos p);
|
||||
DerefExpr(Expr *e, SourcePos p);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
const Type *GetLValueType() const;
|
||||
Symbol *GetBaseSymbol() const;
|
||||
void Print() const;
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
int EstimateCost() const;
|
||||
|
||||
Expr *expr;
|
||||
};
|
||||
|
||||
|
||||
/** @brief Expression that represents dereferencing a pointer to get its
|
||||
value. */
|
||||
class PtrDerefExpr : public DerefExpr {
|
||||
public:
|
||||
PtrDerefExpr(Expr *e, SourcePos p);
|
||||
|
||||
const Type *GetType() const;
|
||||
void Print() const;
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
};
|
||||
|
||||
|
||||
/** @brief Expression that represents dereferencing a reference to get its
|
||||
value. */
|
||||
class RefDerefExpr : public DerefExpr {
|
||||
public:
|
||||
RefDerefExpr(Expr *e, SourcePos p);
|
||||
|
||||
const Type *GetType() const;
|
||||
void Print() const;
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
};
|
||||
|
||||
|
||||
/** Expression that represents taking the address of an expression. */
|
||||
class AddressOfExpr : public Expr {
|
||||
public:
|
||||
@@ -563,6 +591,7 @@ public:
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
int EstimateCost() const;
|
||||
llvm::Constant *GetConstant(const Type *type) const;
|
||||
|
||||
Expr *expr;
|
||||
};
|
||||
@@ -630,20 +659,26 @@ public:
|
||||
function overloading, this method resolves which actual function
|
||||
the arguments match best. If the argCouldBeNULL parameter is
|
||||
non-NULL, each element indicates whether the corresponding argument
|
||||
is the number zero, indicating that it could be a NULL pointer.
|
||||
This parameter may be NULL (for cases where overload resolution is
|
||||
being done just given type information without the parameter
|
||||
argument expressions being available. It returns true on success.
|
||||
is the number zero, indicating that it could be a NULL pointer, and
|
||||
if argIsConstant is non-NULL, each element indicates whether the
|
||||
corresponding argument is a compile-time constant value. Both of
|
||||
these parameters may be NULL (for cases where overload resolution
|
||||
is being done just given type information without the parameter
|
||||
argument expressions being available. This function returns true
|
||||
on success.
|
||||
*/
|
||||
bool ResolveOverloads(SourcePos argPos,
|
||||
const std::vector<const Type *> &argTypes,
|
||||
const std::vector<bool> *argCouldBeNULL = NULL);
|
||||
const std::vector<bool> *argCouldBeNULL = NULL,
|
||||
const std::vector<bool> *argIsConstant = NULL);
|
||||
Symbol *GetMatchingFunction();
|
||||
|
||||
private:
|
||||
bool tryResolve(int (*matchFunc)(const Type *, const Type *),
|
||||
SourcePos argPos, const std::vector<const Type *> &argTypes,
|
||||
const std::vector<bool> *argCouldBeNULL);
|
||||
std::vector<Symbol *> getCandidateFunctions(int argCount) const;
|
||||
static int computeOverloadCost(const FunctionType *ftype,
|
||||
const std::vector<const Type *> &argTypes,
|
||||
const std::vector<bool> *argCouldBeNULL,
|
||||
const std::vector<bool> *argIsConstant);
|
||||
|
||||
/** Name of the function that is being called. */
|
||||
std::string name;
|
||||
|
||||
140
func.cpp
140
func.cpp
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -46,12 +46,21 @@
|
||||
#include "util.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#include <llvm/Type.h>
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Intrinsics.h>
|
||||
#if defined(LLVM_3_1) || defined(LLVM_3_2)
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#include <llvm/Type.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Intrinsics.h>
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#else
|
||||
#include <llvm/IR/LLVMContext.h>
|
||||
#include <llvm/IR/Module.h>
|
||||
#include <llvm/IR/Type.h>
|
||||
#include <llvm/IR/Instructions.h>
|
||||
#include <llvm/IR/Intrinsics.h>
|
||||
#include <llvm/IR/DerivedTypes.h>
|
||||
#endif
|
||||
#include <llvm/PassManager.h>
|
||||
#include <llvm/PassRegistry.h>
|
||||
#include <llvm/Transforms/IPO.h>
|
||||
@@ -59,16 +68,14 @@
|
||||
#include <llvm/Support/FileUtilities.h>
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/Target/TargetOptions.h>
|
||||
#include <llvm/Target/TargetData.h>
|
||||
#include <llvm/PassManager.h>
|
||||
#include <llvm/Analysis/Verifier.h>
|
||||
#include <llvm/Support/CFG.h>
|
||||
#include <llvm/Support/ToolOutputFile.h>
|
||||
#include <llvm/Assembly/PrintModulePass.h>
|
||||
|
||||
Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
|
||||
Function::Function(Symbol *s, Stmt *c) {
|
||||
sym = s;
|
||||
args = a;
|
||||
code = c;
|
||||
|
||||
maskSymbol = m->symbolTable->LookupVariable("__mask");
|
||||
@@ -101,12 +108,20 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
|
||||
printf("\n\n\n");
|
||||
}
|
||||
|
||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
||||
const FunctionType *type = CastType<FunctionType>(sym->type);
|
||||
Assert(type != NULL);
|
||||
|
||||
for (unsigned int i = 0; i < args.size(); ++i)
|
||||
if (dynamic_cast<const ReferenceType *>(args[i]->type) == NULL)
|
||||
args[i]->parentFunction = this;
|
||||
for (int i = 0; i < type->GetNumParameters(); ++i) {
|
||||
const char *paramName = type->GetParameterName(i).c_str();
|
||||
Symbol *sym = m->symbolTable->LookupVariable(paramName);
|
||||
if (sym == NULL)
|
||||
Assert(strncmp(paramName, "__anon_parameter_", 17) == 0);
|
||||
args.push_back(sym);
|
||||
|
||||
const Type *t = type->GetParameterType(i);
|
||||
if (sym != NULL && CastType<ReferenceType>(t) == NULL)
|
||||
sym->parentFunction = this;
|
||||
}
|
||||
|
||||
if (type->isTask) {
|
||||
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
|
||||
@@ -125,7 +140,7 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
|
||||
|
||||
const Type *
|
||||
Function::GetReturnType() const {
|
||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
||||
const FunctionType *type = CastType<FunctionType>(sym->type);
|
||||
Assert(type != NULL);
|
||||
return type->GetReturnType();
|
||||
}
|
||||
@@ -133,7 +148,7 @@ Function::GetReturnType() const {
|
||||
|
||||
const FunctionType *
|
||||
Function::GetType() const {
|
||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
||||
const FunctionType *type = CastType<FunctionType>(sym->type);
|
||||
Assert(type != NULL);
|
||||
return type;
|
||||
}
|
||||
@@ -145,7 +160,8 @@ Function::GetType() const {
|
||||
'mem2reg' pass will in turn promote to SSA registers..
|
||||
*/
|
||||
static void
|
||||
lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol *> &args,
|
||||
lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const
|
||||
std::vector<Symbol *> &args,
|
||||
FunctionEmitContext *ctx) {
|
||||
// We expect the argument structure to come in as a poitner to a
|
||||
// structure. Confirm and figure out its type here.
|
||||
@@ -157,9 +173,13 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
|
||||
llvm::dyn_cast<const llvm::StructType>(pt->getElementType());
|
||||
|
||||
// Get the type of the argument we're copying in and its Symbol pointer
|
||||
LLVM_TYPE_CONST llvm::Type *argType = argStructType->getElementType(i);
|
||||
llvm::Type *argType = argStructType->getElementType(i);
|
||||
Symbol *sym = args[i];
|
||||
|
||||
if (sym == NULL)
|
||||
// anonymous parameter, so don't worry about it
|
||||
return;
|
||||
|
||||
// allocate space to copy the parameter in to
|
||||
sym->storagePtr = ctx->AllocaInst(argType, sym->name.c_str());
|
||||
|
||||
@@ -170,7 +190,7 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
|
||||
// memory
|
||||
llvm::Value *ptrval = ctx->LoadInst(ptr, sym->name.c_str());
|
||||
ctx->StoreInst(ptrval, sym->storagePtr);
|
||||
ctx->EmitFunctionParameterDebugInfo(sym);
|
||||
ctx->EmitFunctionParameterDebugInfo(sym, i);
|
||||
}
|
||||
|
||||
|
||||
@@ -186,14 +206,14 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
// value
|
||||
maskSymbol->storagePtr = ctx->GetFullMaskPointer();
|
||||
|
||||
// add debugging info for __mask, programIndex, ...
|
||||
// add debugging info for __mask
|
||||
maskSymbol->pos = firstStmtPos;
|
||||
ctx->EmitVariableDebugInfo(maskSymbol);
|
||||
|
||||
#if 0
|
||||
llvm::BasicBlock *entryBBlock = ctx->GetCurrentBasicBlock();
|
||||
#endif
|
||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
||||
const FunctionType *type = CastType<FunctionType>(sym->type);
|
||||
Assert(type != NULL);
|
||||
if (type->isTask == true) {
|
||||
// For tasks, we there should always be three parmeters: the
|
||||
@@ -211,13 +231,15 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
for (unsigned int i = 0; i < args.size(); ++i)
|
||||
lCopyInTaskParameter(i, structParamPtr, args, ctx);
|
||||
|
||||
// Copy in the mask as well.
|
||||
int nArgs = (int)args.size();
|
||||
// The mask is the last parameter in the argument structure
|
||||
llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL,
|
||||
"task_struct_mask");
|
||||
llvm::Value *ptrval = ctx->LoadInst(ptr, "mask");
|
||||
ctx->SetFunctionMask(ptrval);
|
||||
if (type->isUnmasked == false) {
|
||||
// Copy in the mask as well.
|
||||
int nArgs = (int)args.size();
|
||||
// The mask is the last parameter in the argument structure
|
||||
llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL,
|
||||
"task_struct_mask");
|
||||
llvm::Value *ptrval = ctx->LoadInst(ptr, "mask");
|
||||
ctx->SetFunctionMask(ptrval);
|
||||
}
|
||||
|
||||
// Copy threadIndex and threadCount into stack-allocated storage so
|
||||
// that their symbols point to something reasonable.
|
||||
@@ -240,13 +262,17 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
llvm::Function::arg_iterator argIter = function->arg_begin();
|
||||
for (unsigned int i = 0; i < args.size(); ++i, ++argIter) {
|
||||
Symbol *sym = args[i];
|
||||
if (sym == NULL)
|
||||
// anonymous function parameter
|
||||
continue;
|
||||
|
||||
argIter->setName(sym->name.c_str());
|
||||
|
||||
// Allocate stack storage for the parameter and emit code
|
||||
// to store the its value there.
|
||||
sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str());
|
||||
ctx->StoreInst(argIter, sym->storagePtr);
|
||||
ctx->EmitFunctionParameterDebugInfo(sym);
|
||||
ctx->EmitFunctionParameterDebugInfo(sym, i);
|
||||
}
|
||||
|
||||
// If the number of actual function arguments is equal to the
|
||||
@@ -254,9 +280,13 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
// don't have a mask parameter, so set it to be all on. This
|
||||
// happens for exmaple with 'export'ed functions that the app
|
||||
// calls.
|
||||
if (argIter == function->arg_end())
|
||||
if (argIter == function->arg_end()) {
|
||||
Assert(type->isUnmasked || type->isExported);
|
||||
ctx->SetFunctionMask(LLVMMaskAllOn);
|
||||
}
|
||||
else {
|
||||
Assert(type->isUnmasked == false);
|
||||
|
||||
// Otherwise use the mask to set the entry mask value
|
||||
argIter->setName("__mask");
|
||||
Assert(argIter->getType() == LLVMTypes::MaskType);
|
||||
@@ -279,21 +309,30 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
// on, all off, or mixed. If this is a simple function, then this
|
||||
// isn't worth the code bloat / overhead.
|
||||
bool checkMask = (type->isTask == true) ||
|
||||
((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
|
||||
(
|
||||
#if defined(LLVM_3_1)
|
||||
(function->hasFnAttr(llvm::Attribute::AlwaysInline) == false)
|
||||
#elif defined(LLVM_3_2)
|
||||
(function->getFnAttributes().hasAttribute(llvm::Attributes::AlwaysInline) == false)
|
||||
#else // LLVM 3.3+
|
||||
(function->getAttributes().getFnAttributes().hasAttribute(llvm::Attribute::AlwaysInline) == false)
|
||||
#endif
|
||||
&&
|
||||
costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
|
||||
checkMask &= (type->isUnmasked == false);
|
||||
checkMask &= (g->target.maskingIsFree == false);
|
||||
checkMask &= (g->opt.disableCoherentControlFlow == false);
|
||||
|
||||
|
||||
if (checkMask) {
|
||||
llvm::Value *mask = ctx->GetFunctionMask();
|
||||
llvm::Value *allOn = ctx->All(mask);
|
||||
llvm::BasicBlock *bbAllOn = ctx->CreateBasicBlock("all_on");
|
||||
llvm::BasicBlock *bbNotAll = ctx->CreateBasicBlock("not_all_on");
|
||||
llvm::BasicBlock *bbSomeOn = ctx->CreateBasicBlock("some_on");
|
||||
|
||||
// Set up basic blocks for goto targets
|
||||
ctx->InitializeLabelMap(code);
|
||||
|
||||
ctx->BranchInst(bbAllOn, bbNotAll, allOn);
|
||||
ctx->BranchInst(bbAllOn, bbSomeOn, allOn);
|
||||
// all on: we've determined dynamically that the mask is all
|
||||
// on. Set the current mask to "all on" explicitly so that
|
||||
// codegen for this path can be improved with this knowledge in
|
||||
@@ -305,23 +344,11 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
if (ctx->GetCurrentBasicBlock())
|
||||
ctx->ReturnInst();
|
||||
|
||||
// not all on: figure out if no instances are running, or if
|
||||
// some of them are
|
||||
ctx->SetCurrentBasicBlock(bbNotAll);
|
||||
ctx->SetFunctionMask(mask);
|
||||
llvm::BasicBlock *bbNoneOn = ctx->CreateBasicBlock("none_on");
|
||||
llvm::BasicBlock *bbSomeOn = ctx->CreateBasicBlock("some_on");
|
||||
llvm::Value *anyOn = ctx->Any(mask);
|
||||
ctx->BranchInst(bbSomeOn, bbNoneOn, anyOn);
|
||||
|
||||
// Everyone is off; get out of here.
|
||||
ctx->SetCurrentBasicBlock(bbNoneOn);
|
||||
ctx->ReturnInst();
|
||||
|
||||
// some on: reset the mask to the value it had at function
|
||||
// entry and emit the code. Resetting the mask here is
|
||||
// important, due to the "all on" setting of it for the path
|
||||
// above
|
||||
// not all on: however, at least one lane must be running,
|
||||
// since we should never run with all off... some on: reset
|
||||
// the mask to the value it had at function entry and emit the
|
||||
// code. Resetting the mask here is important, due to the "all
|
||||
// on" setting of it for the path above.
|
||||
ctx->SetCurrentBasicBlock(bbSomeOn);
|
||||
ctx->SetFunctionMask(mask);
|
||||
|
||||
@@ -355,7 +382,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
// issue a warning. Also need to warn if it's the entry block for
|
||||
// the function (in which case it will not have predeccesors but is
|
||||
// still reachable.)
|
||||
if (type->GetReturnType() != AtomicType::Void &&
|
||||
if (Type::Equal(type->GetReturnType(), AtomicType::Void) == false &&
|
||||
(pred_begin(ec.bblock) != pred_end(ec.bblock) || (ec.bblock == entryBBlock)))
|
||||
Warning(sym->pos, "Missing return statement in function returning \"%s\".",
|
||||
type->rType->GetString().c_str());
|
||||
@@ -415,19 +442,22 @@ Function::GenerateIR() {
|
||||
// If the function is 'export'-qualified, emit a second version of
|
||||
// it without a mask parameter and without name mangling so that
|
||||
// the application can call it
|
||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
||||
const FunctionType *type = CastType<FunctionType>(sym->type);
|
||||
Assert(type != NULL);
|
||||
if (type->isExported) {
|
||||
if (!type->isTask) {
|
||||
LLVM_TYPE_CONST llvm::FunctionType *ftype =
|
||||
type->LLVMFunctionType(g->ctx);
|
||||
llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true);
|
||||
llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
|
||||
std::string functionName = sym->name;
|
||||
if (g->mangleFunctionsWithTarget)
|
||||
functionName += std::string("_") + g->target.GetISAString();
|
||||
llvm::Function *appFunction =
|
||||
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
|
||||
#if defined(LLVM_3_1)
|
||||
appFunction->setDoesNotThrow(true);
|
||||
#else
|
||||
appFunction->setDoesNotThrow();
|
||||
#endif
|
||||
|
||||
if (appFunction->getName() != functionName) {
|
||||
// this was a redefinition for which we already emitted an
|
||||
|
||||
4
func.h
4
func.h
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -43,7 +43,7 @@
|
||||
|
||||
class Function {
|
||||
public:
|
||||
Function(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code);
|
||||
Function(Symbol *sym, Stmt *code);
|
||||
|
||||
const Type *GetReturnType() const;
|
||||
const FunctionType *GetType() const;
|
||||
|
||||
412
ispc.cpp
412
ispc.cpp
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -41,27 +41,41 @@
|
||||
#include "llvmutil.h"
|
||||
#include <stdio.h>
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#include <windows.h>
|
||||
#include <direct.h>
|
||||
#define strcasecmp stricmp
|
||||
#include <windows.h>
|
||||
#include <direct.h>
|
||||
#define strcasecmp stricmp
|
||||
#else
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#if defined(LLVM_3_1) || defined(LLVM_3_2)
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#else
|
||||
#include <llvm/IR/LLVMContext.h>
|
||||
#include <llvm/IR/Module.h>
|
||||
#include <llvm/IR/Instructions.h>
|
||||
#endif
|
||||
#if defined(LLVM_3_1)
|
||||
#include <llvm/Analysis/DebugInfo.h>
|
||||
#include <llvm/Analysis/DIBuilder.h>
|
||||
#else
|
||||
#include <llvm/DebugInfo.h>
|
||||
#include <llvm/DIBuilder.h>
|
||||
#endif
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#include <llvm/Analysis/DIBuilder.h>
|
||||
#include <llvm/Analysis/DebugInfo.h>
|
||||
#include <llvm/Support/Dwarf.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/Target/TargetOptions.h>
|
||||
#include <llvm/Target/TargetData.h>
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
#include <llvm/Support/TargetRegistry.h>
|
||||
#include <llvm/Support/TargetSelect.h>
|
||||
#else
|
||||
#include <llvm/Target/TargetRegistry.h>
|
||||
#include <llvm/Target/TargetSelect.h>
|
||||
#include <llvm/Target/SubtargetFeature.h>
|
||||
#if defined(LLVM_3_1)
|
||||
#include <llvm/Target/TargetData.h>
|
||||
#elif defined(LLVM_3_2)
|
||||
#include <llvm/DataLayout.h>
|
||||
#else // LLVM 3.3+
|
||||
#include <llvm/IR/DataLayout.h>
|
||||
#endif
|
||||
#include <llvm/Support/TargetRegistry.h>
|
||||
#include <llvm/Support/TargetSelect.h>
|
||||
#include <llvm/Support/Host.h>
|
||||
|
||||
Globals *g;
|
||||
@@ -70,31 +84,124 @@ Module *m;
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Target
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
static void __cpuid(int info[4], int infoType) {
|
||||
__asm__ __volatile__ ("cpuid"
|
||||
: "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
: "0" (infoType));
|
||||
}
|
||||
|
||||
/* Save %ebx in case it's the PIC register */
|
||||
static void __cpuidex(int info[4], int level, int count) {
|
||||
__asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
|
||||
"cpuid\n\t"
|
||||
"xchg{l}\t{%%}ebx, %1\n\t"
|
||||
: "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
: "0" (level), "2" (count));
|
||||
}
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
|
||||
|
||||
static const char *
|
||||
lGetSystemISA() {
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
|
||||
if ((info[2] & (1 << 28)) != 0) { // AVX
|
||||
// AVX1 for sure....
|
||||
// Ivy Bridge?
|
||||
if ((info[2] & (1 << 29)) != 0 && // F16C
|
||||
(info[2] & (1 << 30)) != 0) { // RDRAND
|
||||
// So far, so good. AVX2?
|
||||
// Call cpuid with eax=7, ecx=0
|
||||
int info2[4];
|
||||
__cpuidex(info2, 7, 0);
|
||||
if ((info2[1] & (1 << 5)) != 0)
|
||||
return "avx2";
|
||||
else
|
||||
return "avx1.1";
|
||||
}
|
||||
// Regular AVX
|
||||
return "avx";
|
||||
}
|
||||
else if ((info[2] & (1 << 19)) != 0)
|
||||
return "sse4";
|
||||
else if ((info[3] & (1 << 26)) != 0)
|
||||
return "sse2";
|
||||
else {
|
||||
fprintf(stderr, "Unable to detect supported SSE/AVX ISA. Exiting.\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static const char *supportedCPUs[] = {
|
||||
"atom", "penryn", "core2", "corei7", "corei7-avx"
|
||||
#if defined(LLVM_3_2) || defined(LLVM_3_3)
|
||||
, "core-avx-i", "core-avx2"
|
||||
#endif // LLVM_3_2 or LLVM_3_3
|
||||
};
|
||||
|
||||
|
||||
bool
|
||||
Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
bool pic, Target *t) {
|
||||
if (isa == NULL) {
|
||||
if (cpu != NULL) {
|
||||
// If a CPU was specified explicitly, try to pick the best
|
||||
// possible ISA based on that.
|
||||
if (!strcmp(cpu, "core-avx2"))
|
||||
isa = "avx2";
|
||||
else if (!strcmp(cpu, "core-avx-i"))
|
||||
isa = "avx1.1";
|
||||
else if (!strcmp(cpu, "sandybridge") ||
|
||||
!strcmp(cpu, "corei7-avx"))
|
||||
isa = "avx";
|
||||
else if (!strcmp(cpu, "corei7") ||
|
||||
!strcmp(cpu, "penryn"))
|
||||
isa = "sse4";
|
||||
else
|
||||
isa = "sse2";
|
||||
Warning(SourcePos(), "No --target specified on command-line. "
|
||||
"Using ISA \"%s\" based on specified CPU \"%s\".", isa,
|
||||
cpu);
|
||||
}
|
||||
else {
|
||||
// No CPU and no ISA, so use CPUID to figure out what this CPU
|
||||
// supports.
|
||||
isa = lGetSystemISA();
|
||||
Warning(SourcePos(), "No --target specified on command-line. "
|
||||
"Using system ISA \"%s\".", isa);
|
||||
}
|
||||
}
|
||||
|
||||
if (cpu == NULL) {
|
||||
std::string hostCPU = llvm::sys::getHostCPUName();
|
||||
if (hostCPU.size() > 0)
|
||||
cpu = strdup(hostCPU.c_str());
|
||||
else {
|
||||
fprintf(stderr, "Warning: unable to determine host CPU!\n");
|
||||
Warning(SourcePos(), "Unable to determine host CPU!\n");
|
||||
cpu = "generic";
|
||||
}
|
||||
}
|
||||
else {
|
||||
bool foundCPU = false;
|
||||
for (int i = 0; i < int(sizeof(supportedCPUs) / sizeof(supportedCPUs[0]));
|
||||
++i) {
|
||||
if (!strcmp(cpu, supportedCPUs[i])) {
|
||||
foundCPU = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (foundCPU == false) {
|
||||
fprintf(stderr, "Error: CPU type \"%s\" unknown. Supported CPUs: "
|
||||
"%s.\n", cpu, SupportedTargetCPUs().c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
t->cpu = cpu;
|
||||
|
||||
if (isa == NULL) {
|
||||
if (!strcasecmp(cpu, "atom"))
|
||||
isa = "sse2";
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
else if (!strcasecmp(cpu, "sandybridge") ||
|
||||
!strcasecmp(cpu, "corei7-avx"))
|
||||
isa = "avx";
|
||||
#endif // LLVM_3_0
|
||||
else
|
||||
isa = "sse4";
|
||||
}
|
||||
if (arch == NULL)
|
||||
arch = "x86-64";
|
||||
|
||||
@@ -125,13 +232,16 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->arch = arch;
|
||||
}
|
||||
|
||||
// This is the case for most of them
|
||||
t->hasHalf = t->hasRand = t->hasTranscendentals = false;
|
||||
t->hasGather = t->hasScatter = false;
|
||||
|
||||
if (!strcasecmp(isa, "sse2")) {
|
||||
t->isa = Target::SSE2;
|
||||
t->nativeVectorWidth = 4;
|
||||
t->vectorWidth = 4;
|
||||
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
|
||||
t->maskingIsFree = false;
|
||||
t->allOffMaskIsSafe = false;
|
||||
t->maskBitCount = 32;
|
||||
}
|
||||
else if (!strcasecmp(isa, "sse2-x2")) {
|
||||
@@ -140,7 +250,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->vectorWidth = 8;
|
||||
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
|
||||
t->maskingIsFree = false;
|
||||
t->allOffMaskIsSafe = false;
|
||||
t->maskBitCount = 32;
|
||||
}
|
||||
else if (!strcasecmp(isa, "sse4")) {
|
||||
@@ -149,7 +258,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->vectorWidth = 4;
|
||||
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
||||
t->maskingIsFree = false;
|
||||
t->allOffMaskIsSafe = false;
|
||||
t->maskBitCount = 32;
|
||||
}
|
||||
else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) {
|
||||
@@ -158,7 +266,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->vectorWidth = 8;
|
||||
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
||||
t->maskingIsFree = false;
|
||||
t->allOffMaskIsSafe = false;
|
||||
t->maskBitCount = 32;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-4")) {
|
||||
@@ -166,73 +273,136 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->nativeVectorWidth = 4;
|
||||
t->vectorWidth = 4;
|
||||
t->maskingIsFree = true;
|
||||
t->allOffMaskIsSafe = true;
|
||||
t->maskBitCount = 1;
|
||||
t->hasHalf = true;
|
||||
t->hasTranscendentals = true;
|
||||
t->hasGather = t->hasScatter = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-8")) {
|
||||
t->isa = Target::GENERIC;
|
||||
t->nativeVectorWidth = 8;
|
||||
t->vectorWidth = 8;
|
||||
t->maskingIsFree = true;
|
||||
t->allOffMaskIsSafe = true;
|
||||
t->maskBitCount = 1;
|
||||
t->hasHalf = true;
|
||||
t->hasTranscendentals = true;
|
||||
t->hasGather = t->hasScatter = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-16")) {
|
||||
t->isa = Target::GENERIC;
|
||||
t->nativeVectorWidth = 16;
|
||||
t->vectorWidth = 16;
|
||||
t->maskingIsFree = true;
|
||||
t->allOffMaskIsSafe = true;
|
||||
t->maskBitCount = 1;
|
||||
t->hasHalf = true;
|
||||
t->hasTranscendentals = true;
|
||||
t->hasGather = t->hasScatter = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-32")) {
|
||||
t->isa = Target::GENERIC;
|
||||
t->nativeVectorWidth = 32;
|
||||
t->vectorWidth = 32;
|
||||
t->maskingIsFree = true;
|
||||
t->maskBitCount = 1;
|
||||
t->hasHalf = true;
|
||||
t->hasTranscendentals = true;
|
||||
t->hasGather = t->hasScatter = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-64")) {
|
||||
t->isa = Target::GENERIC;
|
||||
t->nativeVectorWidth = 64;
|
||||
t->vectorWidth = 64;
|
||||
t->maskingIsFree = true;
|
||||
t->maskBitCount = 1;
|
||||
t->hasHalf = true;
|
||||
t->hasTranscendentals = true;
|
||||
t->hasGather = t->hasScatter = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-1")) {
|
||||
t->isa = Target::GENERIC;
|
||||
t->nativeVectorWidth = 1;
|
||||
t->vectorWidth = 1;
|
||||
t->maskingIsFree = false;
|
||||
t->allOffMaskIsSafe = false;
|
||||
t->maskBitCount = 32;
|
||||
}
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
else if (!strcasecmp(isa, "avx")) {
|
||||
else if (!strcasecmp(isa, "avx") || !strcasecmp(isa, "avx1")) {
|
||||
t->isa = Target::AVX;
|
||||
t->nativeVectorWidth = 8;
|
||||
t->vectorWidth = 8;
|
||||
t->attributes = "+avx,+popcnt,+cmov";
|
||||
t->maskingIsFree = false;
|
||||
t->allOffMaskIsSafe = false;
|
||||
t->maskBitCount = 32;
|
||||
}
|
||||
else if (!strcasecmp(isa, "avx-x2")) {
|
||||
else if (!strcasecmp(isa, "avx-x2") || !strcasecmp(isa, "avx1-x2")) {
|
||||
t->isa = Target::AVX;
|
||||
t->nativeVectorWidth = 8;
|
||||
t->vectorWidth = 16;
|
||||
t->attributes = "+avx,+popcnt,+cmov";
|
||||
t->maskingIsFree = false;
|
||||
t->allOffMaskIsSafe = false;
|
||||
t->maskBitCount = 32;
|
||||
}
|
||||
#endif // LLVM 3.0+
|
||||
#if defined(LLVM_3_1svn)
|
||||
else if (!strcasecmp(isa, "avx1.1")) {
|
||||
t->isa = Target::AVX11;
|
||||
t->nativeVectorWidth = 8;
|
||||
t->vectorWidth = 8;
|
||||
t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand";
|
||||
t->maskingIsFree = false;
|
||||
t->maskBitCount = 32;
|
||||
t->hasHalf = true;
|
||||
#if !defined(LLVM_3_1)
|
||||
// LLVM 3.2+ only
|
||||
t->hasRand = true;
|
||||
#endif
|
||||
}
|
||||
else if (!strcasecmp(isa, "avx1.1-x2")) {
|
||||
t->isa = Target::AVX11;
|
||||
t->nativeVectorWidth = 8;
|
||||
t->vectorWidth = 16;
|
||||
t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand";
|
||||
t->maskingIsFree = false;
|
||||
t->maskBitCount = 32;
|
||||
t->hasHalf = true;
|
||||
#if !defined(LLVM_3_1)
|
||||
// LLVM 3.2+ only
|
||||
t->hasRand = true;
|
||||
#endif
|
||||
}
|
||||
else if (!strcasecmp(isa, "avx2")) {
|
||||
t->isa = Target::AVX2;
|
||||
t->nativeVectorWidth = 8;
|
||||
t->vectorWidth = 8;
|
||||
t->attributes = "+avx2,+popcnt,+cmov,+f16c";
|
||||
t->attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand"
|
||||
#ifndef LLVM_3_1
|
||||
",+fma"
|
||||
#endif // !LLVM_3_1
|
||||
;
|
||||
t->maskingIsFree = false;
|
||||
t->allOffMaskIsSafe = false;
|
||||
t->maskBitCount = 32;
|
||||
t->hasHalf = true;
|
||||
#if !defined(LLVM_3_1)
|
||||
// LLVM 3.2+ only
|
||||
t->hasRand = true;
|
||||
t->hasGather = true;
|
||||
#endif
|
||||
}
|
||||
else if (!strcasecmp(isa, "avx2-x2")) {
|
||||
t->isa = Target::AVX2;
|
||||
t->nativeVectorWidth = 16;
|
||||
t->vectorWidth = 16;
|
||||
t->attributes = "+avx2,+popcnt,+cmov,+f16c";
|
||||
t->attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand"
|
||||
#ifndef LLVM_3_1
|
||||
",+fma"
|
||||
#endif // !LLVM_3_1
|
||||
;
|
||||
t->maskingIsFree = false;
|
||||
t->allOffMaskIsSafe = false;
|
||||
t->maskBitCount = 32;
|
||||
t->hasHalf = true;
|
||||
#if !defined(LLVM_3_1)
|
||||
// LLVM 3.2+ only
|
||||
t->hasRand = true;
|
||||
t->hasGather = true;
|
||||
#endif
|
||||
}
|
||||
#endif // LLVM 3.1
|
||||
else {
|
||||
fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n",
|
||||
isa, SupportedTargetISAs());
|
||||
@@ -241,25 +411,31 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
|
||||
if (!error) {
|
||||
llvm::TargetMachine *targetMachine = t->GetTargetMachine();
|
||||
#if defined(LLVM_3_1)
|
||||
const llvm::TargetData *targetData = targetMachine->getTargetData();
|
||||
t->is32Bit = (targetData->getPointerSize() == 4);
|
||||
#else
|
||||
int addressSpace = 0;
|
||||
const llvm::DataLayout *dataLayout = targetMachine->getDataLayout();
|
||||
t->is32Bit = (dataLayout->getPointerSize(addressSpace) == 4);
|
||||
#endif
|
||||
Assert(t->vectorWidth <= ISPC_MAX_NVEC);
|
||||
}
|
||||
|
||||
return !error;
|
||||
}
|
||||
|
||||
|
||||
const char *
|
||||
std::string
|
||||
Target::SupportedTargetCPUs() {
|
||||
return "atom, barcelona, core2, corei7, "
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
"corei7-avx, "
|
||||
#endif
|
||||
"istanbul, nocona, penryn, "
|
||||
#ifdef LLVM_2_9
|
||||
"sandybridge, "
|
||||
#endif
|
||||
"westmere";
|
||||
std::string ret;
|
||||
int count = sizeof(supportedCPUs) / sizeof(supportedCPUs[0]);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
ret += supportedCPUs[i];
|
||||
if (i != count - 1)
|
||||
ret += ", ";
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -271,14 +447,9 @@ Target::SupportedTargetArchs() {
|
||||
|
||||
const char *
|
||||
Target::SupportedTargetISAs() {
|
||||
return "sse2, sse2-x2, sse4, sse4-x2"
|
||||
#ifndef LLVM_2_9
|
||||
", avx, avx-x2"
|
||||
#endif // !LLVM_2_9
|
||||
#ifdef LLVM_3_1svn
|
||||
", avx2, avx2-x2"
|
||||
#endif // LLVM_3_1svn
|
||||
", generic-4, generic-8, generic-16, generic-1";
|
||||
return "sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2"
|
||||
", avx1.1, avx1.1-x2, avx2, avx2-x2"
|
||||
", generic-1, generic-4, generic-8, generic-16, generic-32";
|
||||
}
|
||||
|
||||
|
||||
@@ -286,11 +457,7 @@ std::string
|
||||
Target::GetTripleString() const {
|
||||
llvm::Triple triple;
|
||||
// Start with the host triple as the default
|
||||
#if defined(LLVM_3_1) || defined(LLVM_3_1svn)
|
||||
triple.setTriple(llvm::sys::getDefaultTargetTriple());
|
||||
#else
|
||||
triple.setTriple(llvm::sys::getHostTriple());
|
||||
#endif
|
||||
|
||||
// And override the arch in the host triple based on what the user
|
||||
// specified. Here we need to deal with the fact that LLVM uses one
|
||||
@@ -315,30 +482,15 @@ Target::GetTargetMachine() const {
|
||||
|
||||
llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ :
|
||||
llvm::Reloc::Default;
|
||||
#if defined(LLVM_3_1svn)
|
||||
std::string featuresString = attributes;
|
||||
llvm::TargetOptions options;
|
||||
if (g->opt.fastMath == true)
|
||||
options.UnsafeFPMath = 1;
|
||||
#if !defined(LLVM_3_1)
|
||||
if (g->opt.disableFMA == false)
|
||||
options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
|
||||
#endif // !LLVM_3_1
|
||||
llvm::TargetMachine *targetMachine =
|
||||
target->createTargetMachine(triple, cpu, featuresString, options,
|
||||
relocModel);
|
||||
#elif defined(LLVM_3_0)
|
||||
std::string featuresString = attributes;
|
||||
llvm::TargetMachine *targetMachine =
|
||||
target->createTargetMachine(triple, cpu, featuresString, relocModel);
|
||||
#else // LLVM 2.9
|
||||
#ifdef ISPC_IS_APPLE
|
||||
relocModel = llvm::Reloc::PIC_;
|
||||
#endif // ISPC_IS_APPLE
|
||||
std::string featuresString = cpu + std::string(",") + attributes;
|
||||
llvm::TargetMachine *targetMachine =
|
||||
target->createTargetMachine(triple, featuresString);
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
targetMachine->setRelocationModel(relocModel);
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
#endif // LLVM_2_9
|
||||
|
||||
Assert(targetMachine != NULL);
|
||||
|
||||
targetMachine->setAsmVerbosityDefault(true);
|
||||
@@ -355,6 +507,8 @@ Target::GetISAString() const {
|
||||
return "sse4";
|
||||
case Target::AVX:
|
||||
return "avx";
|
||||
case Target::AVX11:
|
||||
return "avx11";
|
||||
case Target::AVX2:
|
||||
return "avx2";
|
||||
case Target::GENERIC:
|
||||
@@ -367,7 +521,7 @@ Target::GetISAString() const {
|
||||
|
||||
|
||||
static bool
|
||||
lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
|
||||
lGenericTypeLayoutIndeterminate(llvm::Type *type) {
|
||||
if (type->isPrimitiveType() || type->isIntegerTy())
|
||||
return false;
|
||||
|
||||
@@ -376,18 +530,18 @@ lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
|
||||
type == LLVMTypes::Int1VectorType)
|
||||
return true;
|
||||
|
||||
LLVM_TYPE_CONST llvm::ArrayType *at =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
|
||||
llvm::ArrayType *at =
|
||||
llvm::dyn_cast<llvm::ArrayType>(type);
|
||||
if (at != NULL)
|
||||
return lGenericTypeLayoutIndeterminate(at->getElementType());
|
||||
|
||||
LLVM_TYPE_CONST llvm::PointerType *pt =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(type);
|
||||
llvm::PointerType *pt =
|
||||
llvm::dyn_cast<llvm::PointerType>(type);
|
||||
if (pt != NULL)
|
||||
return false;
|
||||
|
||||
LLVM_TYPE_CONST llvm::StructType *st =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
|
||||
llvm::StructType *st =
|
||||
llvm::dyn_cast<llvm::StructType>(type);
|
||||
if (st != NULL) {
|
||||
for (int i = 0; i < (int)st->getNumElements(); ++i)
|
||||
if (lGenericTypeLayoutIndeterminate(st->getElementType(i)))
|
||||
@@ -395,29 +549,24 @@ lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type));
|
||||
Assert(llvm::isa<llvm::VectorType>(type));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
llvm::Value *
|
||||
Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
|
||||
Target::SizeOf(llvm::Type *type,
|
||||
llvm::BasicBlock *insertAtEnd) {
|
||||
if (isa == Target::GENERIC &&
|
||||
lGenericTypeLayoutIndeterminate(type)) {
|
||||
llvm::Value *index[1] = { LLVMInt32(1) };
|
||||
LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
|
||||
llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
|
||||
llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
|
||||
llvm::Instruction *gep =
|
||||
llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "sizeof_gep",
|
||||
insertAtEnd);
|
||||
#else
|
||||
llvm::Instruction *gep =
|
||||
llvm::GetElementPtrInst::Create(voidPtr, &index[0], &index[1],
|
||||
"sizeof_gep", insertAtEnd);
|
||||
#endif
|
||||
|
||||
if (is32Bit || g->opt.force32BitAddressing)
|
||||
return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type,
|
||||
"sizeof_int", insertAtEnd);
|
||||
@@ -426,9 +575,18 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
|
||||
"sizeof_int", insertAtEnd);
|
||||
}
|
||||
|
||||
#if defined(LLVM_3_1)
|
||||
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
|
||||
Assert(td != NULL);
|
||||
uint64_t byteSize = td->getTypeSizeInBits(type) / 8;
|
||||
uint64_t bitSize = td->getTypeSizeInBits(type);
|
||||
#else
|
||||
const llvm::DataLayout *dl = GetTargetMachine()->getDataLayout();
|
||||
Assert(dl != NULL);
|
||||
uint64_t bitSize = dl->getTypeSizeInBits(type);
|
||||
#endif
|
||||
|
||||
Assert((bitSize % 8) == 0);
|
||||
uint64_t byteSize = bitSize / 8;
|
||||
if (is32Bit || g->opt.force32BitAddressing)
|
||||
return LLVMInt32((int32_t)byteSize);
|
||||
else
|
||||
@@ -437,23 +595,18 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
|
||||
|
||||
|
||||
llvm::Value *
|
||||
Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
|
||||
Target::StructOffset(llvm::Type *type, int element,
|
||||
llvm::BasicBlock *insertAtEnd) {
|
||||
if (isa == Target::GENERIC &&
|
||||
lGenericTypeLayoutIndeterminate(type) == true) {
|
||||
llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(element) };
|
||||
LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
|
||||
llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
|
||||
llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
|
||||
llvm::Instruction *gep =
|
||||
llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "offset_gep",
|
||||
insertAtEnd);
|
||||
#else
|
||||
llvm::Instruction *gep =
|
||||
llvm::GetElementPtrInst::Create(voidPtr, &indices[0], &indices[2],
|
||||
"offset_gep", insertAtEnd);
|
||||
#endif
|
||||
|
||||
if (is32Bit || g->opt.force32BitAddressing)
|
||||
return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type,
|
||||
"offset_int", insertAtEnd);
|
||||
@@ -462,12 +615,22 @@ Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
|
||||
"offset_int", insertAtEnd);
|
||||
}
|
||||
|
||||
llvm::StructType *structType =
|
||||
llvm::dyn_cast<llvm::StructType>(type);
|
||||
if (structType == NULL || structType->isSized() == false) {
|
||||
Assert(m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if defined(LLVM_3_1)
|
||||
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
|
||||
Assert(td != NULL);
|
||||
LLVM_TYPE_CONST llvm::StructType *structType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
|
||||
Assert(structType != NULL);
|
||||
const llvm::StructLayout *sl = td->getStructLayout(structType);
|
||||
#else
|
||||
const llvm::DataLayout *dl = GetTargetMachine()->getDataLayout();
|
||||
Assert(dl != NULL);
|
||||
const llvm::StructLayout *sl = dl->getStructLayout(structType);
|
||||
#endif
|
||||
Assert(sl != NULL);
|
||||
|
||||
uint64_t offset = sl->getElementOffset(element);
|
||||
@@ -488,6 +651,8 @@ Opt::Opt() {
|
||||
force32BitAddressing = true;
|
||||
unrollLoops = true;
|
||||
disableAsserts = false;
|
||||
disableFMA = false;
|
||||
forceAlignedMemory = false;
|
||||
disableMaskAllOnOptimizations = false;
|
||||
disableHandlePseudoMemoryOps = false;
|
||||
disableBlendedMaskedStores = false;
|
||||
@@ -497,6 +662,7 @@ Opt::Opt() {
|
||||
disableMaskedStoreToStore = false;
|
||||
disableGatherScatterFlattening = false;
|
||||
disableUniformMemoryOptimizations = false;
|
||||
disableCoalescing = false;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -510,12 +676,16 @@ Globals::Globals() {
|
||||
debugPrint = false;
|
||||
disableWarnings = false;
|
||||
warningsAsErrors = false;
|
||||
quiet = false;
|
||||
forceColoredOutput = false;
|
||||
disableLineWrap = false;
|
||||
emitPerfWarnings = true;
|
||||
emitInstrumentation = false;
|
||||
generateDebuggingSymbols = false;
|
||||
enableFuzzTest = false;
|
||||
fuzzTestSeed = -1;
|
||||
mangleFunctionsWithTarget = false;
|
||||
|
||||
|
||||
ctx = new llvm::LLVMContext;
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
@@ -548,7 +718,9 @@ llvm::DIFile
|
||||
SourcePos::GetDIFile() const {
|
||||
std::string directory, filename;
|
||||
GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
|
||||
return m->diBuilder->createFile(filename, directory);
|
||||
llvm::DIFile ret = m->diBuilder->createFile(filename, directory);
|
||||
Assert(ret.Verify());
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
120
ispc.h
120
ispc.h
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -38,8 +38,10 @@
|
||||
#ifndef ISPC_H
|
||||
#define ISPC_H
|
||||
|
||||
#if !defined(LLVM_2_9) && !defined(LLVM_3_0) && !defined(LLVM_3_0svn) && !defined(LLVM_3_1svn)
|
||||
#error "Only LLVM 2.9, 3.0, and the 3.1 development branch are supported"
|
||||
#define ISPC_VERSION "1.3.1dev"
|
||||
|
||||
#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3)
|
||||
#error "Only LLVM 3.1, 3.2 and the 3.3 development branch are supported"
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
@@ -49,6 +51,9 @@
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
#if defined(__KNC__)
|
||||
#define ISPC_IS_KNC
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
@@ -56,20 +61,10 @@
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#define Assert(expr) \
|
||||
((void)((expr) ? 0 : __Assert (#expr, __FILE__, __LINE__)))
|
||||
#define __Assert(expr, file, line) \
|
||||
((void)fprintf(stderr, "%s:%u: Assertion failed: \"%s\"\n" \
|
||||
"***\n*** Please file a bug report at " \
|
||||
"https://github.com/ispc/ispc/issues\n*** (Including as much " \
|
||||
"information as you can about how to reproduce this error).\n" \
|
||||
"*** You have apparently encountered a bug in the compiler that " \
|
||||
"we'd like to fix!\n***\n", file, line, expr), abort(), 0)
|
||||
|
||||
/** @def ISPC_MAX_NVEC maximum vector size of any of the compliation
|
||||
targets.
|
||||
*/
|
||||
#define ISPC_MAX_NVEC 16
|
||||
#define ISPC_MAX_NVEC 64
|
||||
|
||||
// Forward declarations of a number of widely-used LLVM types
|
||||
namespace llvm {
|
||||
@@ -90,12 +85,6 @@ namespace llvm {
|
||||
class Value;
|
||||
}
|
||||
|
||||
// llvm::Type *s are no longer const in llvm 3.0
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
#define LLVM_TYPE_CONST
|
||||
#else
|
||||
#define LLVM_TYPE_CONST const
|
||||
#endif
|
||||
|
||||
class ArrayType;
|
||||
class AST;
|
||||
@@ -107,12 +96,22 @@ class ExprList;
|
||||
class Function;
|
||||
class FunctionType;
|
||||
class Module;
|
||||
class PointerType;
|
||||
class Stmt;
|
||||
class Symbol;
|
||||
class SymbolTable;
|
||||
class Type;
|
||||
struct VariableDeclaration;
|
||||
|
||||
enum StorageClass {
|
||||
SC_NONE,
|
||||
SC_EXTERN,
|
||||
SC_STATIC,
|
||||
SC_TYPEDEF,
|
||||
SC_EXTERN_C
|
||||
};
|
||||
|
||||
|
||||
/** @brief Representation of a range of positions in a source file.
|
||||
|
||||
This class represents a range of characters in a source file
|
||||
@@ -139,11 +138,25 @@ struct SourcePos {
|
||||
bool operator==(const SourcePos &p2) const;
|
||||
};
|
||||
|
||||
|
||||
/** Returns a SourcePos that encompasses the extent of both of the given
|
||||
extents. */
|
||||
SourcePos Union(const SourcePos &p1, const SourcePos &p2);
|
||||
|
||||
|
||||
|
||||
// Assert
|
||||
|
||||
extern void DoAssert(const char *file, int line, const char *expr);
|
||||
extern void DoAssertPos(SourcePos pos, const char *file, int line, const char *expr);
|
||||
|
||||
#define Assert(expr) \
|
||||
((void)((expr) ? 0 : ((void)DoAssert (__FILE__, __LINE__, #expr), 0)))
|
||||
|
||||
#define AssertPos(pos, expr) \
|
||||
((void)((expr) ? 0 : ((void)DoAssertPos (pos, __FILE__, __LINE__, #expr), 0)))
|
||||
|
||||
|
||||
/** @brief Structure that defines a compilation target
|
||||
|
||||
This structure defines a compilation target for the ispc compiler.
|
||||
@@ -161,7 +174,7 @@ struct Target {
|
||||
|
||||
/** Returns a comma-delimited string giving the names of the currently
|
||||
supported target CPUs. */
|
||||
static const char *SupportedTargetCPUs();
|
||||
static std::string SupportedTargetCPUs();
|
||||
|
||||
/** Returns a comma-delimited string giving the names of the currently
|
||||
supported target architectures. */
|
||||
@@ -179,13 +192,13 @@ struct Target {
|
||||
const char *GetISAString() const;
|
||||
|
||||
/** Returns the size of the given type */
|
||||
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Value *SizeOf(llvm::Type *type,
|
||||
llvm::BasicBlock *insertAtEnd);
|
||||
|
||||
/** Given a structure type and an element number in the structure,
|
||||
returns a value corresponding to the number of bytes from the start
|
||||
of the structure where the element is located. */
|
||||
llvm::Value *StructOffset(LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Value *StructOffset(llvm::Type *type,
|
||||
int element, llvm::BasicBlock *insertAtEnd);
|
||||
|
||||
/** llvm Target object representing this target. */
|
||||
@@ -197,7 +210,7 @@ struct Target {
|
||||
flexible/performant of them will apear last in the enumerant. Note
|
||||
also that __best_available_isa() needs to be updated if ISAs are
|
||||
added or the enumerant values are reordered. */
|
||||
enum ISA { SSE2, SSE4, AVX, AVX2, GENERIC, NUM_ISAS };
|
||||
enum ISA { SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS };
|
||||
|
||||
/** Instruction set being compiled to. */
|
||||
ISA isa;
|
||||
@@ -233,16 +246,27 @@ struct Target {
|
||||
natively. */
|
||||
bool maskingIsFree;
|
||||
|
||||
/** Is it safe to run code with the mask all if: e.g. on SSE, the fast
|
||||
gather trick assumes that at least one program instance is running
|
||||
(so that it can safely assume that the array base pointer is
|
||||
valid). */
|
||||
bool allOffMaskIsSafe;
|
||||
|
||||
/** How many bits are used to store each element of the mask: e.g. this
|
||||
is 32 on SSE/AVX, since that matches the HW better, but it's 1 for
|
||||
the generic target. */
|
||||
int maskBitCount;
|
||||
|
||||
/** Indicates whether the target has native support for float/half
|
||||
conversions. */
|
||||
bool hasHalf;
|
||||
|
||||
/** Indicates whether there is an ISA random number instruction. */
|
||||
bool hasRand;
|
||||
|
||||
/** Indicates whether the target has a native gather instruction */
|
||||
bool hasGather;
|
||||
|
||||
/** Indicates whether the target has a native scatter instruction */
|
||||
bool hasScatter;
|
||||
|
||||
/** Indicates whether the target has support for transcendentals (beyond
|
||||
sqrt, which we assume that all of them handle). */
|
||||
bool hasTranscendentals;
|
||||
};
|
||||
|
||||
|
||||
@@ -283,6 +307,16 @@ struct Opt {
|
||||
performance in the generated code). */
|
||||
bool disableAsserts;
|
||||
|
||||
/** Indicates whether FMA instructions should be disabled (on targets
|
||||
that support them). */
|
||||
bool disableFMA;
|
||||
|
||||
/** Always generate aligned vector load/store instructions; this
|
||||
implies a guarantee that all dynamic access through pointers that
|
||||
becomes a vector load/store will be a cache-aligned sequence of
|
||||
locations. */
|
||||
bool forceAlignedMemory;
|
||||
|
||||
/** If enabled, disables the various optimizations that kick in when
|
||||
the execution mask can be determined to be "all on" at compile
|
||||
time. */
|
||||
@@ -339,6 +373,10 @@ struct Opt {
|
||||
than gathers/scatters. This is likely only useful for measuring
|
||||
the impact of this optimization. */
|
||||
bool disableUniformMemoryOptimizations;
|
||||
|
||||
/** Disables optimizations that coalesce incoherent scalar memory
|
||||
access from gathers into wider vector operations, when possible. */
|
||||
bool disableCoalescing;
|
||||
};
|
||||
|
||||
/** @brief This structure collects together a number of global variables.
|
||||
@@ -388,6 +426,13 @@ struct Globals {
|
||||
possible performance pitfalls. */
|
||||
bool emitPerfWarnings;
|
||||
|
||||
/** Indicates whether all printed output should be surpressed. */
|
||||
bool quiet;
|
||||
|
||||
/** Always use ANSI escape sequences to colorize warning and error
|
||||
messages, even if piping output to a file, etc. */
|
||||
bool forceColoredOutput;
|
||||
|
||||
/** Indicates whether calls should be emitted in the program to an
|
||||
externally-defined program instrumentation function. (See the
|
||||
"Instrumenting your ispc programs" section in the user's
|
||||
@@ -402,6 +447,14 @@ struct Globals {
|
||||
vector width to them. */
|
||||
bool mangleFunctionsWithTarget;
|
||||
|
||||
/** If enabled, the lexer will randomly replace some tokens returned
|
||||
with other tokens, in order to test error condition handling in the
|
||||
compiler. */
|
||||
bool enableFuzzTest;
|
||||
|
||||
/** Seed for random number generator used for fuzz testing. */
|
||||
int fuzzTestSeed;
|
||||
|
||||
/** Global LLVMContext object */
|
||||
llvm::LLVMContext *ctx;
|
||||
|
||||
@@ -412,11 +465,14 @@ struct Globals {
|
||||
/** Arguments to pass along to the C pre-processor, if it is run on the
|
||||
program before compilation. */
|
||||
std::vector<std::string> cppArgs;
|
||||
|
||||
/** Additional user-provided directories to search when processing
|
||||
#include directives in the preprocessor. */
|
||||
std::vector<std::string> includePath;
|
||||
};
|
||||
|
||||
enum {
|
||||
COST_ASSIGN = 1,
|
||||
COST_COHERENT_BREAK_CONTINE = 4,
|
||||
COST_COMPLEX_ARITH_OP = 4,
|
||||
COST_DELETE = 32,
|
||||
COST_DEREF = 4,
|
||||
@@ -427,7 +483,7 @@ enum {
|
||||
COST_GOTO = 4,
|
||||
COST_LOAD = 2,
|
||||
COST_NEW = 32,
|
||||
COST_REGULAR_BREAK_CONTINUE = 2,
|
||||
COST_BREAK_CONTINUE = 3,
|
||||
COST_RETURN = 4,
|
||||
COST_SELECT = 4,
|
||||
COST_SIMPLE_ARITH_LOGIC_OP = 1,
|
||||
|
||||
64
ispc.vcxproj
64
ispc.vcxproj
@@ -20,6 +20,8 @@
|
||||
<ClCompile Include="func.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx1.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx1-x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx11.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx11-x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx2-x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-c-32.cpp" />
|
||||
@@ -29,6 +31,8 @@
|
||||
<ClCompile Include="gen-bitcode-generic-4.cpp" />
|
||||
<ClCompile Include="gen-bitcode-generic-8.cpp" />
|
||||
<ClCompile Include="gen-bitcode-generic-16.cpp" />
|
||||
<ClCompile Include="gen-bitcode-generic-32.cpp" />
|
||||
<ClCompile Include="gen-bitcode-generic-64.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse2-x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse4.cpp" />
|
||||
@@ -186,6 +190,32 @@
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins\target-avx11.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll > gen-bitcode-avx11.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx11.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll > gen-bitcode-avx11.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx11.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx11.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx11.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins\target-avx11-x2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll > gen-bitcode-avx11-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx11-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll > gen-bitcode-avx11-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx11-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\targets-avx-x2.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx11-x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx11-x2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins\target-avx2.ll">
|
||||
<FileType>Document</FileType>
|
||||
@@ -264,6 +294,32 @@
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-16.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins\target-generic-32.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll > gen-bitcode-generic-32.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-32.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll > gen-bitcode-generic-32.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-32.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-32.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-32.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins\target-generic-64.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll > gen-bitcode-generic-64.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-64.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll > gen-bitcode-generic-64.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-64.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-64.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-64.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="lex.ll">
|
||||
<FileType>Document</FileType>
|
||||
@@ -324,7 +380,7 @@
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>NOMINMAX;%LLVM_VERSION%</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
@@ -332,7 +388,7 @@
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
@@ -342,7 +398,7 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>NOMINMAX;%LLVM_VERSION%</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
@@ -352,7 +408,7 @@
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
652
lex.ll
652
lex.ll
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -43,31 +43,294 @@
|
||||
#include <stdint.h>
|
||||
|
||||
static uint64_t lParseBinary(const char *ptr, SourcePos pos, char **endPtr);
|
||||
static int lParseInteger(bool dotdotdot);
|
||||
static void lCComment(SourcePos *);
|
||||
static void lCppComment(SourcePos *);
|
||||
static void lHandleCppHash(SourcePos *);
|
||||
static void lStringConst(YYSTYPE *, SourcePos *);
|
||||
static double lParseHexFloat(const char *ptr);
|
||||
extern void RegisterDependency(const std::string &fileName);
|
||||
|
||||
#define YY_USER_ACTION \
|
||||
yylloc->first_line = yylloc->last_line; \
|
||||
yylloc->first_column = yylloc->last_column; \
|
||||
yylloc->last_column += yyleng;
|
||||
yylloc.first_line = yylloc.last_line; \
|
||||
yylloc.first_column = yylloc.last_column; \
|
||||
yylloc.last_column += yyleng;
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
inline int isatty(int) { return 0; }
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
|
||||
static int allTokens[] = {
|
||||
TOKEN_ASSERT, TOKEN_BOOL, TOKEN_BREAK, TOKEN_CASE,
|
||||
TOKEN_CDO, TOKEN_CFOR, TOKEN_CIF, TOKEN_CWHILE,
|
||||
TOKEN_CONST, TOKEN_CONTINUE, TOKEN_DEFAULT, TOKEN_DO,
|
||||
TOKEN_DELETE, TOKEN_DOUBLE, TOKEN_ELSE, TOKEN_ENUM,
|
||||
TOKEN_EXPORT, TOKEN_EXTERN, TOKEN_FALSE, TOKEN_FLOAT, TOKEN_FOR,
|
||||
TOKEN_FOREACH, TOKEN_FOREACH_ACTIVE, TOKEN_FOREACH_TILED,
|
||||
TOKEN_FOREACH_UNIQUE, TOKEN_GOTO, TOKEN_IF, TOKEN_IN, TOKEN_INLINE,
|
||||
TOKEN_INT, TOKEN_INT8, TOKEN_INT16, TOKEN_INT, TOKEN_INT64, TOKEN_LAUNCH,
|
||||
TOKEN_NEW, TOKEN_NULL, TOKEN_PRINT, TOKEN_RETURN, TOKEN_SOA, TOKEN_SIGNED,
|
||||
TOKEN_SIZEOF, TOKEN_STATIC, TOKEN_STRUCT, TOKEN_SWITCH, TOKEN_SYNC,
|
||||
TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED,
|
||||
TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE,
|
||||
TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT,
|
||||
TOKEN_FLOAT_CONSTANT,
|
||||
TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT,
|
||||
TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT,
|
||||
TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP,
|
||||
TOKEN_GE_OP, TOKEN_EQ_OP, TOKEN_NE_OP, TOKEN_AND_OP, TOKEN_OR_OP,
|
||||
TOKEN_MUL_ASSIGN, TOKEN_DIV_ASSIGN, TOKEN_MOD_ASSIGN, TOKEN_ADD_ASSIGN,
|
||||
TOKEN_SUB_ASSIGN, TOKEN_LEFT_ASSIGN, TOKEN_RIGHT_ASSIGN, TOKEN_AND_ASSIGN,
|
||||
TOKEN_XOR_ASSIGN, TOKEN_OR_ASSIGN, TOKEN_PTR_OP,
|
||||
';', '{', '}', ',', ':', '=', '(', ')', '[', ']', '.', '&', '!', '~', '-',
|
||||
'+', '*', '/', '%', '<', '>', '^', '|', '?',
|
||||
};
|
||||
|
||||
std::map<int, std::string> tokenToName;
|
||||
std::map<std::string, std::string> tokenNameRemap;
|
||||
|
||||
void ParserInit() {
|
||||
tokenToName[TOKEN_ASSERT] = "assert";
|
||||
tokenToName[TOKEN_BOOL] = "bool";
|
||||
tokenToName[TOKEN_BREAK] = "break";
|
||||
tokenToName[TOKEN_CASE] = "case";
|
||||
tokenToName[TOKEN_CDO] = "cdo";
|
||||
tokenToName[TOKEN_CFOR] = "cfor";
|
||||
tokenToName[TOKEN_CIF] = "cif";
|
||||
tokenToName[TOKEN_CWHILE] = "cwhile";
|
||||
tokenToName[TOKEN_CONST] = "const";
|
||||
tokenToName[TOKEN_CONTINUE] = "continue";
|
||||
tokenToName[TOKEN_DEFAULT] = "default";
|
||||
tokenToName[TOKEN_DO] = "do";
|
||||
tokenToName[TOKEN_DELETE] = "delete";
|
||||
tokenToName[TOKEN_DOUBLE] = "double";
|
||||
tokenToName[TOKEN_ELSE] = "else";
|
||||
tokenToName[TOKEN_ENUM] = "enum";
|
||||
tokenToName[TOKEN_EXPORT] = "export";
|
||||
tokenToName[TOKEN_EXTERN] = "extern";
|
||||
tokenToName[TOKEN_FALSE] = "false";
|
||||
tokenToName[TOKEN_FLOAT] = "float";
|
||||
tokenToName[TOKEN_FOR] = "for";
|
||||
tokenToName[TOKEN_FOREACH] = "foreach";
|
||||
tokenToName[TOKEN_FOREACH_ACTIVE] = "foreach_active";
|
||||
tokenToName[TOKEN_FOREACH_TILED] = "foreach_tiled";
|
||||
tokenToName[TOKEN_FOREACH_UNIQUE] = "foreach_unique";
|
||||
tokenToName[TOKEN_GOTO] = "goto";
|
||||
tokenToName[TOKEN_IF] = "if";
|
||||
tokenToName[TOKEN_IN] = "in";
|
||||
tokenToName[TOKEN_INLINE] = "inline";
|
||||
tokenToName[TOKEN_INT] = "int";
|
||||
tokenToName[TOKEN_INT8] = "int8";
|
||||
tokenToName[TOKEN_INT16] = "int16";
|
||||
tokenToName[TOKEN_INT] = "int";
|
||||
tokenToName[TOKEN_INT64] = "int64";
|
||||
tokenToName[TOKEN_LAUNCH] = "launch";
|
||||
tokenToName[TOKEN_NEW] = "new";
|
||||
tokenToName[TOKEN_NULL] = "NULL";
|
||||
tokenToName[TOKEN_PRINT] = "print";
|
||||
tokenToName[TOKEN_RETURN] = "return";
|
||||
tokenToName[TOKEN_SOA] = "soa";
|
||||
tokenToName[TOKEN_SIGNED] = "signed";
|
||||
tokenToName[TOKEN_SIZEOF] = "sizeof";
|
||||
tokenToName[TOKEN_STATIC] = "static";
|
||||
tokenToName[TOKEN_STRUCT] = "struct";
|
||||
tokenToName[TOKEN_SWITCH] = "switch";
|
||||
tokenToName[TOKEN_SYNC] = "sync";
|
||||
tokenToName[TOKEN_TASK] = "task";
|
||||
tokenToName[TOKEN_TRUE] = "true";
|
||||
tokenToName[TOKEN_TYPEDEF] = "typedef";
|
||||
tokenToName[TOKEN_UNIFORM] = "uniform";
|
||||
tokenToName[TOKEN_UNMASKED] = "unmasked";
|
||||
tokenToName[TOKEN_UNSIGNED] = "unsigned";
|
||||
tokenToName[TOKEN_VARYING] = "varying";
|
||||
tokenToName[TOKEN_VOID] = "void";
|
||||
tokenToName[TOKEN_WHILE] = "while";
|
||||
tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\"";
|
||||
tokenToName[TOKEN_DOTDOTDOT] = "...";
|
||||
tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT";
|
||||
tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT";
|
||||
tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT";
|
||||
tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT";
|
||||
tokenToName[TOKEN_UINT64_CONSTANT] = "TOKEN_UINT64_CONSTANT";
|
||||
tokenToName[TOKEN_INC_OP] = "++";
|
||||
tokenToName[TOKEN_DEC_OP] = "--";
|
||||
tokenToName[TOKEN_LEFT_OP] = "<<";
|
||||
tokenToName[TOKEN_RIGHT_OP] = ">>";
|
||||
tokenToName[TOKEN_LE_OP] = "<=";
|
||||
tokenToName[TOKEN_GE_OP] = ">=";
|
||||
tokenToName[TOKEN_EQ_OP] = "==";
|
||||
tokenToName[TOKEN_NE_OP] = "!=";
|
||||
tokenToName[TOKEN_AND_OP] = "&&";
|
||||
tokenToName[TOKEN_OR_OP] = "||";
|
||||
tokenToName[TOKEN_MUL_ASSIGN] = "*=";
|
||||
tokenToName[TOKEN_DIV_ASSIGN] = "/=";
|
||||
tokenToName[TOKEN_MOD_ASSIGN] = "%=";
|
||||
tokenToName[TOKEN_ADD_ASSIGN] = "+=";
|
||||
tokenToName[TOKEN_SUB_ASSIGN] = "-=";
|
||||
tokenToName[TOKEN_LEFT_ASSIGN] = "<<=";
|
||||
tokenToName[TOKEN_RIGHT_ASSIGN] = ">>=";
|
||||
tokenToName[TOKEN_AND_ASSIGN] = "&=";
|
||||
tokenToName[TOKEN_XOR_ASSIGN] = "^=";
|
||||
tokenToName[TOKEN_OR_ASSIGN] = "|=";
|
||||
tokenToName[TOKEN_PTR_OP] = "->";
|
||||
tokenToName[';'] = ";";
|
||||
tokenToName['{'] = "{";
|
||||
tokenToName['}'] = "}";
|
||||
tokenToName[','] = ",";
|
||||
tokenToName[':'] = ":";
|
||||
tokenToName['='] = "=";
|
||||
tokenToName['('] = "(";
|
||||
tokenToName[')'] = ")";
|
||||
tokenToName['['] = "[";
|
||||
tokenToName[']'] = "]";
|
||||
tokenToName['.'] = ".";
|
||||
tokenToName['&'] = "&";
|
||||
tokenToName['!'] = "!";
|
||||
tokenToName['~'] = "~";
|
||||
tokenToName['-'] = "-";
|
||||
tokenToName['+'] = "+";
|
||||
tokenToName['*'] = "*";
|
||||
tokenToName['/'] = "/";
|
||||
tokenToName['%'] = "%";
|
||||
tokenToName['<'] = "<";
|
||||
tokenToName['>'] = ">";
|
||||
tokenToName['^'] = "^";
|
||||
tokenToName['|'] = "|";
|
||||
tokenToName['?'] = "?";
|
||||
tokenToName[';'] = ";";
|
||||
|
||||
tokenNameRemap["TOKEN_ASSERT"] = "\'assert\'";
|
||||
tokenNameRemap["TOKEN_BOOL"] = "\'bool\'";
|
||||
tokenNameRemap["TOKEN_BREAK"] = "\'break\'";
|
||||
tokenNameRemap["TOKEN_CASE"] = "\'case\'";
|
||||
tokenNameRemap["TOKEN_CDO"] = "\'cdo\'";
|
||||
tokenNameRemap["TOKEN_CFOR"] = "\'cfor\'";
|
||||
tokenNameRemap["TOKEN_CIF"] = "\'cif\'";
|
||||
tokenNameRemap["TOKEN_CWHILE"] = "\'cwhile\'";
|
||||
tokenNameRemap["TOKEN_CONST"] = "\'const\'";
|
||||
tokenNameRemap["TOKEN_CONTINUE"] = "\'continue\'";
|
||||
tokenNameRemap["TOKEN_DEFAULT"] = "\'default\'";
|
||||
tokenNameRemap["TOKEN_DO"] = "\'do\'";
|
||||
tokenNameRemap["TOKEN_DELETE"] = "\'delete\'";
|
||||
tokenNameRemap["TOKEN_DOUBLE"] = "\'double\'";
|
||||
tokenNameRemap["TOKEN_ELSE"] = "\'else\'";
|
||||
tokenNameRemap["TOKEN_ENUM"] = "\'enum\'";
|
||||
tokenNameRemap["TOKEN_EXPORT"] = "\'export\'";
|
||||
tokenNameRemap["TOKEN_EXTERN"] = "\'extern\'";
|
||||
tokenNameRemap["TOKEN_FALSE"] = "\'false\'";
|
||||
tokenNameRemap["TOKEN_FLOAT"] = "\'float\'";
|
||||
tokenNameRemap["TOKEN_FOR"] = "\'for\'";
|
||||
tokenNameRemap["TOKEN_FOREACH"] = "\'foreach\'";
|
||||
tokenNameRemap["TOKEN_FOREACH_ACTIVE"] = "\'foreach_active\'";
|
||||
tokenNameRemap["TOKEN_FOREACH_TILED"] = "\'foreach_tiled\'";
|
||||
tokenNameRemap["TOKEN_FOREACH_UNIQUE"] = "\'foreach_unique\'";
|
||||
tokenNameRemap["TOKEN_GOTO"] = "\'goto\'";
|
||||
tokenNameRemap["TOKEN_IDENTIFIER"] = "identifier";
|
||||
tokenNameRemap["TOKEN_IF"] = "\'if\'";
|
||||
tokenNameRemap["TOKEN_IN"] = "\'in\'";
|
||||
tokenNameRemap["TOKEN_INLINE"] = "\'inline\'";
|
||||
tokenNameRemap["TOKEN_INT"] = "\'int\'";
|
||||
tokenNameRemap["TOKEN_INT8"] = "\'int8\'";
|
||||
tokenNameRemap["TOKEN_INT16"] = "\'int16\'";
|
||||
tokenNameRemap["TOKEN_INT"] = "\'int\'";
|
||||
tokenNameRemap["TOKEN_INT64"] = "\'int64\'";
|
||||
tokenNameRemap["TOKEN_LAUNCH"] = "\'launch\'";
|
||||
tokenNameRemap["TOKEN_NEW"] = "\'new\'";
|
||||
tokenNameRemap["TOKEN_NULL"] = "\'NULL\'";
|
||||
tokenNameRemap["TOKEN_PRINT"] = "\'print\'";
|
||||
tokenNameRemap["TOKEN_RETURN"] = "\'return\'";
|
||||
tokenNameRemap["TOKEN_SOA"] = "\'soa\'";
|
||||
tokenNameRemap["TOKEN_SIGNED"] = "\'signed\'";
|
||||
tokenNameRemap["TOKEN_SIZEOF"] = "\'sizeof\'";
|
||||
tokenNameRemap["TOKEN_STATIC"] = "\'static\'";
|
||||
tokenNameRemap["TOKEN_STRUCT"] = "\'struct\'";
|
||||
tokenNameRemap["TOKEN_SWITCH"] = "\'switch\'";
|
||||
tokenNameRemap["TOKEN_SYNC"] = "\'sync\'";
|
||||
tokenNameRemap["TOKEN_TASK"] = "\'task\'";
|
||||
tokenNameRemap["TOKEN_TRUE"] = "\'true\'";
|
||||
tokenNameRemap["TOKEN_TYPEDEF"] = "\'typedef\'";
|
||||
tokenNameRemap["TOKEN_UNIFORM"] = "\'uniform\'";
|
||||
tokenNameRemap["TOKEN_UNMASKED"] = "\'unmasked\'";
|
||||
tokenNameRemap["TOKEN_UNSIGNED"] = "\'unsigned\'";
|
||||
tokenNameRemap["TOKEN_VARYING"] = "\'varying\'";
|
||||
tokenNameRemap["TOKEN_VOID"] = "\'void\'";
|
||||
tokenNameRemap["TOKEN_WHILE"] = "\'while\'";
|
||||
tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\"";
|
||||
tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'";
|
||||
tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant";
|
||||
tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant";
|
||||
tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant";
|
||||
tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant";
|
||||
tokenNameRemap["TOKEN_UINT64_CONSTANT"] = "unsigned int64 constant";
|
||||
tokenNameRemap["TOKEN_INC_OP"] = "\'++\'";
|
||||
tokenNameRemap["TOKEN_DEC_OP"] = "\'--\'";
|
||||
tokenNameRemap["TOKEN_LEFT_OP"] = "\'<<\'";
|
||||
tokenNameRemap["TOKEN_RIGHT_OP"] = "\'>>\'";
|
||||
tokenNameRemap["TOKEN_LE_OP"] = "\'<=\'";
|
||||
tokenNameRemap["TOKEN_GE_OP"] = "\'>=\'";
|
||||
tokenNameRemap["TOKEN_EQ_OP"] = "\'==\'";
|
||||
tokenNameRemap["TOKEN_NE_OP"] = "\'!=\'";
|
||||
tokenNameRemap["TOKEN_AND_OP"] = "\'&&\'";
|
||||
tokenNameRemap["TOKEN_OR_OP"] = "\'||\'";
|
||||
tokenNameRemap["TOKEN_MUL_ASSIGN"] = "\'*=\'";
|
||||
tokenNameRemap["TOKEN_DIV_ASSIGN"] = "\'/=\'";
|
||||
tokenNameRemap["TOKEN_MOD_ASSIGN"] = "\'%=\'";
|
||||
tokenNameRemap["TOKEN_ADD_ASSIGN"] = "\'+=\'";
|
||||
tokenNameRemap["TOKEN_SUB_ASSIGN"] = "\'-=\'";
|
||||
tokenNameRemap["TOKEN_LEFT_ASSIGN"] = "\'<<=\'";
|
||||
tokenNameRemap["TOKEN_RIGHT_ASSIGN"] = "\'>>=\'";
|
||||
tokenNameRemap["TOKEN_AND_ASSIGN"] = "\'&=\'";
|
||||
tokenNameRemap["TOKEN_XOR_ASSIGN"] = "\'^=\'";
|
||||
tokenNameRemap["TOKEN_OR_ASSIGN"] = "\'|=\'";
|
||||
tokenNameRemap["TOKEN_PTR_OP"] = "\'->\'";
|
||||
tokenNameRemap["$end"] = "end of file";
|
||||
}
|
||||
|
||||
|
||||
inline int ispcRand() {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return rand();
|
||||
#else
|
||||
return lrand48();
|
||||
#endif
|
||||
}
|
||||
|
||||
#define RT \
|
||||
if (g->enableFuzzTest) { \
|
||||
int r = ispcRand() % 40; \
|
||||
if (r == 0) { \
|
||||
Warning(yylloc, "Fuzz test dropping token"); \
|
||||
} \
|
||||
else if (r == 1) { \
|
||||
Assert (tokenToName.size() > 0); \
|
||||
int nt = sizeof(allTokens) / sizeof(allTokens[0]); \
|
||||
int tn = ispcRand() % nt; \
|
||||
yylval.stringVal = new std::string(yytext); /* just in case */\
|
||||
Warning(yylloc, "Fuzz test replaced token with \"%s\"", tokenToName[allTokens[tn]].c_str()); \
|
||||
return allTokens[tn]; \
|
||||
} \
|
||||
else if (r == 2) { \
|
||||
Symbol *sym = m->symbolTable->RandomSymbol(); \
|
||||
if (sym != NULL) { \
|
||||
yylval.stringVal = new std::string(sym->name); \
|
||||
Warning(yylloc, "Fuzz test replaced with identifier \"%s\".", sym->name.c_str()); \
|
||||
return TOKEN_IDENTIFIER; \
|
||||
} \
|
||||
} \
|
||||
/* TOKEN_TYPE_NAME */ \
|
||||
} else /* swallow semicolon */
|
||||
|
||||
%}
|
||||
|
||||
%option nounput
|
||||
%option noyywrap
|
||||
%option bison-bridge
|
||||
%option bison-locations
|
||||
%option nounistd
|
||||
|
||||
WHITESPACE [ \t\r]+
|
||||
INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[kMG]?
|
||||
INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*
|
||||
INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\.
|
||||
FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
|
||||
HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)
|
||||
|
||||
@@ -75,200 +338,167 @@ IDENT [a-zA-Z_][a-zA-Z_0-9]*
|
||||
ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+
|
||||
|
||||
%%
|
||||
"/*" { lCComment(yylloc); }
|
||||
"//" { lCppComment(yylloc); }
|
||||
"/*" { lCComment(&yylloc); }
|
||||
"//" { lCppComment(&yylloc); }
|
||||
|
||||
__assert { return TOKEN_ASSERT; }
|
||||
bool { return TOKEN_BOOL; }
|
||||
break { return TOKEN_BREAK; }
|
||||
case { return TOKEN_CASE; }
|
||||
cbreak { return TOKEN_CBREAK; }
|
||||
ccontinue { return TOKEN_CCONTINUE; }
|
||||
cdo { return TOKEN_CDO; }
|
||||
cfor { return TOKEN_CFOR; }
|
||||
cif { return TOKEN_CIF; }
|
||||
cwhile { return TOKEN_CWHILE; }
|
||||
const { return TOKEN_CONST; }
|
||||
continue { return TOKEN_CONTINUE; }
|
||||
creturn { return TOKEN_CRETURN; }
|
||||
default { return TOKEN_DEFAULT; }
|
||||
do { return TOKEN_DO; }
|
||||
delete { return TOKEN_DELETE; }
|
||||
delete\[\] { return TOKEN_DELETE; }
|
||||
double { return TOKEN_DOUBLE; }
|
||||
else { return TOKEN_ELSE; }
|
||||
enum { return TOKEN_ENUM; }
|
||||
export { return TOKEN_EXPORT; }
|
||||
extern { return TOKEN_EXTERN; }
|
||||
false { return TOKEN_FALSE; }
|
||||
float { return TOKEN_FLOAT; }
|
||||
for { return TOKEN_FOR; }
|
||||
foreach { return TOKEN_FOREACH; }
|
||||
foreach_tiled { return TOKEN_FOREACH_TILED; }
|
||||
goto { return TOKEN_GOTO; }
|
||||
if { return TOKEN_IF; }
|
||||
inline { return TOKEN_INLINE; }
|
||||
int { return TOKEN_INT; }
|
||||
int8 { return TOKEN_INT8; }
|
||||
int16 { return TOKEN_INT16; }
|
||||
int32 { return TOKEN_INT; }
|
||||
int64 { return TOKEN_INT64; }
|
||||
launch { return TOKEN_LAUNCH; }
|
||||
new { return TOKEN_NEW; }
|
||||
NULL { return TOKEN_NULL; }
|
||||
print { return TOKEN_PRINT; }
|
||||
reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
|
||||
"please use C++-style '&' syntax for references "
|
||||
"instead."); }
|
||||
return { return TOKEN_RETURN; }
|
||||
soa { return TOKEN_SOA; }
|
||||
signed { return TOKEN_SIGNED; }
|
||||
sizeof { return TOKEN_SIZEOF; }
|
||||
static { return TOKEN_STATIC; }
|
||||
struct { return TOKEN_STRUCT; }
|
||||
switch { return TOKEN_SWITCH; }
|
||||
sync { return TOKEN_SYNC; }
|
||||
task { return TOKEN_TASK; }
|
||||
true { return TOKEN_TRUE; }
|
||||
typedef { return TOKEN_TYPEDEF; }
|
||||
uniform { return TOKEN_UNIFORM; }
|
||||
unsigned { return TOKEN_UNSIGNED; }
|
||||
varying { return TOKEN_VARYING; }
|
||||
void { return TOKEN_VOID; }
|
||||
while { return TOKEN_WHILE; }
|
||||
\"C\" { return TOKEN_STRING_C_LITERAL; }
|
||||
\.\.\. { return TOKEN_DOTDOTDOT; }
|
||||
__assert { RT; return TOKEN_ASSERT; }
|
||||
bool { RT; return TOKEN_BOOL; }
|
||||
break { RT; return TOKEN_BREAK; }
|
||||
case { RT; return TOKEN_CASE; }
|
||||
cbreak { RT; Warning(yylloc, "\"cbreak\" is deprecated. Use \"break\"."); return TOKEN_BREAK; }
|
||||
ccontinue { RT; Warning(yylloc, "\"ccontinue\" is deprecated. Use \"continue\"."); return TOKEN_CONTINUE; }
|
||||
cdo { RT; return TOKEN_CDO; }
|
||||
cfor { RT; return TOKEN_CFOR; }
|
||||
cif { RT; return TOKEN_CIF; }
|
||||
cwhile { RT; return TOKEN_CWHILE; }
|
||||
const { RT; return TOKEN_CONST; }
|
||||
continue { RT; return TOKEN_CONTINUE; }
|
||||
creturn { RT; Warning(yylloc, "\"creturn\" is deprecated. Use \"return\"."); return TOKEN_RETURN; }
|
||||
__declspec { RT; return TOKEN_DECLSPEC; }
|
||||
default { RT; return TOKEN_DEFAULT; }
|
||||
do { RT; return TOKEN_DO; }
|
||||
delete { RT; return TOKEN_DELETE; }
|
||||
delete\[\] { RT; return TOKEN_DELETE; }
|
||||
double { RT; return TOKEN_DOUBLE; }
|
||||
else { RT; return TOKEN_ELSE; }
|
||||
enum { RT; return TOKEN_ENUM; }
|
||||
export { RT; return TOKEN_EXPORT; }
|
||||
extern { RT; return TOKEN_EXTERN; }
|
||||
false { RT; return TOKEN_FALSE; }
|
||||
float { RT; return TOKEN_FLOAT; }
|
||||
for { RT; return TOKEN_FOR; }
|
||||
foreach { RT; return TOKEN_FOREACH; }
|
||||
foreach_active { RT; return TOKEN_FOREACH_ACTIVE; }
|
||||
foreach_tiled { RT; return TOKEN_FOREACH_TILED; }
|
||||
foreach_unique { RT; return TOKEN_FOREACH_UNIQUE; }
|
||||
goto { RT; return TOKEN_GOTO; }
|
||||
if { RT; return TOKEN_IF; }
|
||||
in { RT; return TOKEN_IN; }
|
||||
inline { RT; return TOKEN_INLINE; }
|
||||
int { RT; return TOKEN_INT; }
|
||||
int8 { RT; return TOKEN_INT8; }
|
||||
int16 { RT; return TOKEN_INT16; }
|
||||
int32 { RT; return TOKEN_INT; }
|
||||
int64 { RT; return TOKEN_INT64; }
|
||||
launch { RT; return TOKEN_LAUNCH; }
|
||||
new { RT; return TOKEN_NEW; }
|
||||
NULL { RT; return TOKEN_NULL; }
|
||||
print { RT; return TOKEN_PRINT; }
|
||||
return { RT; return TOKEN_RETURN; }
|
||||
soa { RT; return TOKEN_SOA; }
|
||||
signed { RT; return TOKEN_SIGNED; }
|
||||
sizeof { RT; return TOKEN_SIZEOF; }
|
||||
static { RT; return TOKEN_STATIC; }
|
||||
struct { RT; return TOKEN_STRUCT; }
|
||||
switch { RT; return TOKEN_SWITCH; }
|
||||
sync { RT; return TOKEN_SYNC; }
|
||||
task { RT; return TOKEN_TASK; }
|
||||
true { RT; return TOKEN_TRUE; }
|
||||
typedef { RT; return TOKEN_TYPEDEF; }
|
||||
uniform { RT; return TOKEN_UNIFORM; }
|
||||
unmasked { RT; return TOKEN_UNMASKED; }
|
||||
unsigned { RT; return TOKEN_UNSIGNED; }
|
||||
varying { RT; return TOKEN_VARYING; }
|
||||
void { RT; return TOKEN_VOID; }
|
||||
while { RT; return TOKEN_WHILE; }
|
||||
\"C\" { RT; return TOKEN_STRING_C_LITERAL; }
|
||||
\.\.\. { RT; return TOKEN_DOTDOTDOT; }
|
||||
|
||||
L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; }
|
||||
L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERAL; }
|
||||
|
||||
{IDENT} {
|
||||
RT;
|
||||
/* We have an identifier--is it a type name or an identifier?
|
||||
The symbol table will straighten us out... */
|
||||
yylval->stringVal = new std::string(yytext);
|
||||
yylval.stringVal = new std::string(yytext);
|
||||
if (m->symbolTable->LookupType(yytext) != NULL)
|
||||
return TOKEN_TYPE_NAME;
|
||||
else
|
||||
return TOKEN_IDENTIFIER;
|
||||
}
|
||||
|
||||
{INT_NUMBER}+(u|U|l|L)*? {
|
||||
int ls = 0, us = 0;
|
||||
{INT_NUMBER} {
|
||||
RT;
|
||||
return lParseInteger(false);
|
||||
}
|
||||
|
||||
char *endPtr = NULL;
|
||||
if (yytext[0] == '0' && yytext[1] == 'b')
|
||||
yylval->intVal = lParseBinary(yytext+2, *yylloc, &endPtr);
|
||||
else {
|
||||
#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
|
||||
yylval->intVal = _strtoui64(yytext, &endPtr, 0);
|
||||
#else
|
||||
// FIXME: should use strtouq and then issue an error if we can't
|
||||
// fit into 64 bits...
|
||||
yylval->intVal = strtoull(yytext, &endPtr, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
bool kilo = false, mega = false, giga = false;
|
||||
for (; *endPtr; endPtr++) {
|
||||
if (*endPtr == 'k')
|
||||
kilo = true;
|
||||
else if (*endPtr == 'M')
|
||||
mega = true;
|
||||
else if (*endPtr == 'G')
|
||||
giga = true;
|
||||
else if (*endPtr == 'l' || *endPtr == 'L')
|
||||
ls++;
|
||||
else if (*endPtr == 'u' || *endPtr == 'U')
|
||||
us++;
|
||||
}
|
||||
if (kilo)
|
||||
yylval->intVal *= 1024;
|
||||
if (mega)
|
||||
yylval->intVal *= 1024*1024;
|
||||
if (giga)
|
||||
yylval->intVal *= 1024*1024*1024;
|
||||
|
||||
if (ls >= 2)
|
||||
return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
|
||||
else if (ls == 1)
|
||||
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
|
||||
|
||||
// See if we can fit this into a 32-bit integer...
|
||||
if ((yylval->intVal & 0xffffffff) == yylval->intVal)
|
||||
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
|
||||
else
|
||||
return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
|
||||
{INT_NUMBER_DOTDOTDOT} {
|
||||
RT;
|
||||
return lParseInteger(true);
|
||||
}
|
||||
|
||||
|
||||
{FLOAT_NUMBER} {
|
||||
yylval->floatVal = (float)atof(yytext);
|
||||
RT;
|
||||
yylval.floatVal = (float)atof(yytext);
|
||||
return TOKEN_FLOAT_CONSTANT;
|
||||
}
|
||||
|
||||
{HEX_FLOAT_NUMBER} {
|
||||
yylval->floatVal = (float)lParseHexFloat(yytext);
|
||||
RT;
|
||||
yylval.floatVal = (float)lParseHexFloat(yytext);
|
||||
return TOKEN_FLOAT_CONSTANT;
|
||||
}
|
||||
|
||||
"++" { return TOKEN_INC_OP; }
|
||||
"--" { return TOKEN_DEC_OP; }
|
||||
"<<" { return TOKEN_LEFT_OP; }
|
||||
">>" { return TOKEN_RIGHT_OP; }
|
||||
"<=" { return TOKEN_LE_OP; }
|
||||
">=" { return TOKEN_GE_OP; }
|
||||
"==" { return TOKEN_EQ_OP; }
|
||||
"!=" { return TOKEN_NE_OP; }
|
||||
"&&" { return TOKEN_AND_OP; }
|
||||
"||" { return TOKEN_OR_OP; }
|
||||
"*=" { return TOKEN_MUL_ASSIGN; }
|
||||
"/=" { return TOKEN_DIV_ASSIGN; }
|
||||
"%=" { return TOKEN_MOD_ASSIGN; }
|
||||
"+=" { return TOKEN_ADD_ASSIGN; }
|
||||
"-=" { return TOKEN_SUB_ASSIGN; }
|
||||
"<<=" { return TOKEN_LEFT_ASSIGN; }
|
||||
">>=" { return TOKEN_RIGHT_ASSIGN; }
|
||||
"&=" { return TOKEN_AND_ASSIGN; }
|
||||
"^=" { return TOKEN_XOR_ASSIGN; }
|
||||
"|=" { return TOKEN_OR_ASSIGN; }
|
||||
"->" { return TOKEN_PTR_OP; }
|
||||
";" { return ';'; }
|
||||
("{"|"<%") { return '{'; }
|
||||
("}"|"%>") { return '}'; }
|
||||
"," { return ','; }
|
||||
":" { return ':'; }
|
||||
"=" { return '='; }
|
||||
"(" { return '('; }
|
||||
")" { return ')'; }
|
||||
("["|"<:") { return '['; }
|
||||
("]"|":>") { return ']'; }
|
||||
"." { return '.'; }
|
||||
"&" { return '&'; }
|
||||
"!" { return '!'; }
|
||||
"~" { return '~'; }
|
||||
"-" { return '-'; }
|
||||
"+" { return '+'; }
|
||||
"*" { return '*'; }
|
||||
"/" { return '/'; }
|
||||
"%" { return '%'; }
|
||||
"<" { return '<'; }
|
||||
">" { return '>'; }
|
||||
"^" { return '^'; }
|
||||
"|" { return '|'; }
|
||||
"?" { return '?'; }
|
||||
"++" { RT; return TOKEN_INC_OP; }
|
||||
"--" { RT; return TOKEN_DEC_OP; }
|
||||
"<<" { RT; return TOKEN_LEFT_OP; }
|
||||
">>" { RT; return TOKEN_RIGHT_OP; }
|
||||
"<=" { RT; return TOKEN_LE_OP; }
|
||||
">=" { RT; return TOKEN_GE_OP; }
|
||||
"==" { RT; return TOKEN_EQ_OP; }
|
||||
"!=" { RT; return TOKEN_NE_OP; }
|
||||
"&&" { RT; return TOKEN_AND_OP; }
|
||||
"||" { RT; return TOKEN_OR_OP; }
|
||||
"*=" { RT; return TOKEN_MUL_ASSIGN; }
|
||||
"/=" { RT; return TOKEN_DIV_ASSIGN; }
|
||||
"%=" { RT; return TOKEN_MOD_ASSIGN; }
|
||||
"+=" { RT; return TOKEN_ADD_ASSIGN; }
|
||||
"-=" { RT; return TOKEN_SUB_ASSIGN; }
|
||||
"<<=" { RT; return TOKEN_LEFT_ASSIGN; }
|
||||
">>=" { RT; return TOKEN_RIGHT_ASSIGN; }
|
||||
"&=" { RT; return TOKEN_AND_ASSIGN; }
|
||||
"^=" { RT; return TOKEN_XOR_ASSIGN; }
|
||||
"|=" { RT; return TOKEN_OR_ASSIGN; }
|
||||
"->" { RT; return TOKEN_PTR_OP; }
|
||||
";" { RT; return ';'; }
|
||||
("{"|"<%") { RT; return '{'; }
|
||||
("}"|"%>") { RT; return '}'; }
|
||||
"," { RT; return ','; }
|
||||
":" { RT; return ':'; }
|
||||
"=" { RT; return '='; }
|
||||
"(" { RT; return '('; }
|
||||
")" { RT; return ')'; }
|
||||
("["|"<:") { RT; return '['; }
|
||||
("]"|":>") { RT; return ']'; }
|
||||
"." { RT; return '.'; }
|
||||
"&" { RT; return '&'; }
|
||||
"!" { RT; return '!'; }
|
||||
"~" { RT; return '~'; }
|
||||
"-" { RT; return '-'; }
|
||||
"+" { RT; return '+'; }
|
||||
"*" { RT; return '*'; }
|
||||
"/" { RT; return '/'; }
|
||||
"%" { RT; return '%'; }
|
||||
"<" { RT; return '<'; }
|
||||
">" { RT; return '>'; }
|
||||
"^" { RT; return '^'; }
|
||||
"|" { RT; return '|'; }
|
||||
"?" { RT; return '?'; }
|
||||
|
||||
{WHITESPACE} { }
|
||||
|
||||
\n {
|
||||
yylloc->last_line++;
|
||||
yylloc->last_column = 1;
|
||||
yylloc.last_line++;
|
||||
yylloc.last_column = 1;
|
||||
}
|
||||
|
||||
#(line)?[ ][0-9]+[ ]\"(\\.|[^\\"])*\"[^\n]* {
|
||||
lHandleCppHash(yylloc);
|
||||
lHandleCppHash(&yylloc);
|
||||
}
|
||||
|
||||
. {
|
||||
Error(*yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0]));
|
||||
Error(yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0]));
|
||||
YY_USER_ACTION
|
||||
}
|
||||
|
||||
@@ -304,13 +534,94 @@ lParseBinary(const char *ptr, SourcePos pos, char **endPtr) {
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
lParseInteger(bool dotdotdot) {
|
||||
int ls = 0, us = 0;
|
||||
|
||||
char *endPtr = NULL;
|
||||
if (yytext[0] == '0' && yytext[1] == 'b')
|
||||
yylval.intVal = lParseBinary(yytext+2, yylloc, &endPtr);
|
||||
else {
|
||||
#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
|
||||
yylval.intVal = _strtoui64(yytext, &endPtr, 0);
|
||||
#else
|
||||
// FIXME: should use strtouq and then issue an error if we can't
|
||||
// fit into 64 bits...
|
||||
yylval.intVal = strtoull(yytext, &endPtr, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
bool kilo = false, mega = false, giga = false;
|
||||
for (; *endPtr; endPtr++) {
|
||||
if (*endPtr == 'k')
|
||||
kilo = true;
|
||||
else if (*endPtr == 'M')
|
||||
mega = true;
|
||||
else if (*endPtr == 'G')
|
||||
giga = true;
|
||||
else if (*endPtr == 'l' || *endPtr == 'L')
|
||||
ls++;
|
||||
else if (*endPtr == 'u' || *endPtr == 'U')
|
||||
us++;
|
||||
else
|
||||
Assert(dotdotdot && *endPtr == '.');
|
||||
}
|
||||
if (kilo)
|
||||
yylval.intVal *= 1024;
|
||||
if (mega)
|
||||
yylval.intVal *= 1024*1024;
|
||||
if (giga)
|
||||
yylval.intVal *= 1024*1024*1024;
|
||||
|
||||
if (dotdotdot) {
|
||||
if (ls >= 2)
|
||||
return us ? TOKEN_UINT64DOTDOTDOT_CONSTANT : TOKEN_INT64DOTDOTDOT_CONSTANT;
|
||||
else if (ls == 1)
|
||||
return us ? TOKEN_UINT32DOTDOTDOT_CONSTANT : TOKEN_INT32DOTDOTDOT_CONSTANT;
|
||||
|
||||
// See if we can fit this into a 32-bit integer...
|
||||
if ((yylval.intVal & 0xffffffff) == yylval.intVal)
|
||||
return us ? TOKEN_UINT32DOTDOTDOT_CONSTANT : TOKEN_INT32DOTDOTDOT_CONSTANT;
|
||||
else
|
||||
return us ? TOKEN_UINT64DOTDOTDOT_CONSTANT : TOKEN_INT64DOTDOTDOT_CONSTANT;
|
||||
}
|
||||
else {
|
||||
if (ls >= 2)
|
||||
return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
|
||||
else if (ls == 1)
|
||||
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
|
||||
else if (us) {
|
||||
// u suffix only
|
||||
if (yylval.intVal <= 0xffffffffL)
|
||||
return TOKEN_UINT32_CONSTANT;
|
||||
else
|
||||
return TOKEN_UINT64_CONSTANT;
|
||||
}
|
||||
else {
|
||||
// No u or l suffix
|
||||
// First, see if we can fit this into a 32-bit integer...
|
||||
if (yylval.intVal <= 0x7fffffffULL)
|
||||
return TOKEN_INT32_CONSTANT;
|
||||
else if (yylval.intVal <= 0xffffffffULL)
|
||||
return TOKEN_UINT32_CONSTANT;
|
||||
else if (yylval.intVal <= 0x7fffffffffffffffULL)
|
||||
return TOKEN_INT64_CONSTANT;
|
||||
else
|
||||
return TOKEN_UINT64_CONSTANT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Handle a C-style comment in the source.
|
||||
*/
|
||||
static void
|
||||
lCComment(SourcePos *pos) {
|
||||
char c, prev = 0;
|
||||
|
||||
|
||||
while ((c = yyinput()) != 0) {
|
||||
++pos->last_column;
|
||||
|
||||
if (c == '\n') {
|
||||
pos->last_line++;
|
||||
pos->last_column = 1;
|
||||
@@ -373,6 +684,7 @@ static void lHandleCppHash(SourcePos *pos) {
|
||||
++src;
|
||||
}
|
||||
pos->name = strdup(filename.c_str());
|
||||
RegisterDependency(filename);
|
||||
}
|
||||
|
||||
|
||||
@@ -415,7 +727,7 @@ lEscapeChar(char *str, char *pChar, SourcePos *pos)
|
||||
str = tail - 1;
|
||||
break;
|
||||
default:
|
||||
Error(*pos, "Bad character escape sequence: '%s'\n.", str);
|
||||
Error(*pos, "Bad character escape sequence: '%s'.", str);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -435,7 +747,7 @@ lStringConst(YYSTYPE *yylval, SourcePos *pos)
|
||||
std::string str;
|
||||
p = strchr(yytext, '"') + 1;
|
||||
while (*p != '\"') {
|
||||
char cval;
|
||||
char cval = '\0';
|
||||
p = lEscapeChar(p, &cval, pos);
|
||||
str.push_back(cval);
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user