Compare commits
883 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6c392ee4a1 | ||
|
|
7699eda5ba | ||
|
|
d8b5fd5409 | ||
|
|
b37ffdbe85 | ||
|
|
481bcc732b | ||
|
|
ce175aee4c | ||
|
|
50896b373b | ||
|
|
1a40f936df | ||
|
|
1024ba9b0f | ||
|
|
1a7ac8b804 | ||
|
|
7bedb4a081 | ||
|
|
630215f56f | ||
|
|
6f0e5fd402 | ||
|
|
66ec43739a | ||
|
|
44f9d1ed78 | ||
|
|
c6d479b8ad | ||
|
|
80e2f4e342 | ||
|
|
4b388edca9 | ||
|
|
5362dade37 | ||
|
|
3d24265d50 | ||
|
|
7057fc2930 | ||
|
|
e661ee95ff | ||
|
|
333b89fa07 | ||
|
|
04e888548e | ||
|
|
403d9e1059 | ||
|
|
4ea02c59d8 | ||
|
|
de7ba7a55b | ||
|
|
23ba61e76f | ||
|
|
c0aa7e0314 | ||
|
|
9f44e597d6 | ||
|
|
60c5bef90f | ||
|
|
a38fcf1127 | ||
|
|
f22e237381 | ||
|
|
b6b9daa3c5 | ||
|
|
d958b0b9d6 | ||
|
|
7b2eaf63af | ||
|
|
aef20b536a | ||
|
|
1a34b1410f | ||
|
|
5af2f80bc5 | ||
|
|
4d241736f0 | ||
|
|
a47460b4c3 | ||
|
|
32be338f60 | ||
|
|
549655bff4 | ||
|
|
3e18cec691 | ||
|
|
658dd3486b | ||
|
|
018e9a12a3 | ||
|
|
2027a6ac12 | ||
|
|
26bec62daf | ||
|
|
7497e86902 | ||
|
|
e084f1c311 | ||
|
|
95950885cf | ||
|
|
9cd84aeea9 | ||
|
|
d324ec247e | ||
|
|
cbb0d6ce06 | ||
|
|
d36ab4cc3c | ||
|
|
1069a3c77e | ||
|
|
36da23c9c5 | ||
|
|
e756daa261 | ||
|
|
65ac336211 | ||
|
|
14fe987956 | ||
|
|
c5acf239f2 | ||
|
|
a02500b112 | ||
|
|
8fea85a85c | ||
|
|
6f42bfc640 | ||
|
|
6b8741dbd7 | ||
|
|
ce5d4b6ccc | ||
|
|
eb2e5f378c | ||
|
|
9251ea2b22 | ||
|
|
cb650d6100 | ||
|
|
3f3d9219ea | ||
|
|
11528b0def | ||
|
|
a5f0e713d6 | ||
|
|
de3a864bd6 | ||
|
|
af551b3c09 | ||
|
|
5d64d23b61 | ||
|
|
269a00f9ec | ||
|
|
29c482789f | ||
|
|
a0462fe1ee | ||
|
|
78a840f48d | ||
|
|
7371d82bdf | ||
|
|
4c35d9456a | ||
|
|
0704081d91 | ||
|
|
5898532605 | ||
|
|
603abf70dc | ||
|
|
0a3822f2e5 | ||
|
|
f76eb2b7f5 | ||
|
|
08bbf5f5ef | ||
|
|
4ea08116b8 | ||
|
|
78e03c6402 | ||
|
|
4ab89de343 | ||
|
|
6db460fb81 | ||
|
|
be859df51e | ||
|
|
3c533f2ba4 | ||
|
|
9cf037c90f | ||
|
|
9e0425e824 | ||
|
|
2960479095 | ||
|
|
baf1ff033a | ||
|
|
52dcbf087a | ||
|
|
d1c6331924 | ||
|
|
0af2a13349 | ||
|
|
7f0c92eb4d | ||
|
|
0f86255279 | ||
|
|
b2517c8a18 | ||
|
|
95d0c5e67b | ||
|
|
0885b2bf23 | ||
|
|
706337bb5a | ||
|
|
3f8a678c5a | ||
|
|
0f631ad49b | ||
|
|
5bc3b4f768 | ||
|
|
e8bd464ab2 | ||
|
|
ef1af547e2 | ||
|
|
f2dcad27bb | ||
|
|
01992006b2 | ||
|
|
487820fb4d | ||
|
|
d760d67598 | ||
|
|
3cb827ac56 | ||
|
|
15c5dfc12b | ||
|
|
524939dc5b | ||
|
|
51fdff208e | ||
|
|
65a309648b | ||
|
|
7d08eeb8dd | ||
|
|
9e0428ba0d | ||
|
|
15461a2460 | ||
|
|
88f21b5c57 | ||
|
|
bee3029764 | ||
|
|
150d6d1f56 | ||
|
|
d03e4ac100 | ||
|
|
8d8d9c63fe | ||
|
|
52147ce631 | ||
|
|
5af41df8a5 | ||
|
|
775ecd6dfe | ||
|
|
b139235c62 | ||
|
|
8f2c910600 | ||
|
|
7e67f01d4b | ||
|
|
ad7e800446 | ||
|
|
6326924de7 | ||
|
|
801a3a6ff5 | ||
|
|
a4e94a26ba | ||
|
|
44e6be7914 | ||
|
|
3aaf2ef2d4 | ||
|
|
8f902fde9c | ||
|
|
b6023c517e | ||
|
|
42d77e9191 | ||
|
|
312a7582df | ||
|
|
dc939eba78 | ||
|
|
f8bec51de2 | ||
|
|
0bf1320a32 | ||
|
|
81dbd504aa | ||
|
|
63dd7d9859 | ||
|
|
2063d34f3e | ||
|
|
83fdc2e5ad | ||
|
|
6ba7368ab0 | ||
|
|
c2805942a9 | ||
|
|
9892c8bf9a | ||
|
|
23e5877509 | ||
|
|
8cbfde6092 | ||
|
|
24087ff3cc | ||
|
|
6827001c1d | ||
|
|
16b0806d40 | ||
|
|
2129b1e27d | ||
|
|
a267762f59 | ||
|
|
65ca795030 | ||
|
|
e82b649ec0 | ||
|
|
275cdb1713 | ||
|
|
d3b86dcc90 | ||
|
|
c736b75075 | ||
|
|
b601331362 | ||
|
|
32d44a5b9e | ||
|
|
810784da1f | ||
|
|
d517b37f3f | ||
|
|
adeef0af01 | ||
|
|
97ddc1ed10 | ||
|
|
bf580648a1 | ||
|
|
ecc54fa0eb | ||
|
|
04d32ae3e6 | ||
|
|
a18b3ae88e | ||
|
|
e57801a5d1 | ||
|
|
da4390aede | ||
|
|
9e85667219 | ||
|
|
b80867d473 | ||
|
|
d742dcce59 | ||
|
|
7a7af3d5f9 | ||
|
|
e323b1d0ad | ||
|
|
3c18c7a713 | ||
|
|
7c16292cb7 | ||
|
|
d665e2e85b | ||
|
|
172a189c6f | ||
|
|
406fbab40e | ||
|
|
09dc217f8c | ||
|
|
9002837750 | ||
|
|
411d5b44ef | ||
|
|
360cc8044e | ||
|
|
ec2e9b5e79 | ||
|
|
881dba61e4 | ||
|
|
6412876f64 | ||
|
|
538d51cbfe | ||
|
|
3dd9ff3d84 | ||
|
|
7f386923b0 | ||
|
|
d2312b1fbd | ||
|
|
6655373ac3 | ||
|
|
d492af7bc0 | ||
|
|
230a7b7374 | ||
|
|
4204a752f7 | ||
|
|
0e88d5f97f | ||
|
|
a13e7f2435 | ||
|
|
be2108260e | ||
|
|
59b0a2b208 | ||
|
|
05a5a42a08 | ||
|
|
f0b0618484 | ||
|
|
4ecdbe4bd9 | ||
|
|
9e9f266e52 | ||
|
|
0ce67f37ac | ||
|
|
ddcd0a49ec | ||
|
|
63b8fac852 | ||
|
|
def8d7850b | ||
|
|
0442efc856 | ||
|
|
f928bbb53c | ||
|
|
1ab7500dbb | ||
|
|
c58d92d46b | ||
|
|
8276e912fd | ||
|
|
e0490d0df5 | ||
|
|
11db466a88 | ||
|
|
caaee0b666 | ||
|
|
f2f470f369 | ||
|
|
09bb36f58c | ||
|
|
21719df6fd | ||
|
|
39329809dd | ||
|
|
44797e2925 | ||
|
|
c8f373d119 | ||
|
|
8a22c63889 | ||
|
|
1a4434d314 | ||
|
|
165a13b13e | ||
|
|
43364b2d69 | ||
|
|
6eaecd20d5 | ||
|
|
c80bfeacf6 | ||
|
|
2a19cc1758 | ||
|
|
8f5189f606 | ||
|
|
49dde7c6f2 | ||
|
|
765a0d8896 | ||
|
|
19d8f2e258 | ||
|
|
e6aec96e05 | ||
|
|
a2d42c3242 | ||
|
|
52836aae87 | ||
|
|
bda566d6a7 | ||
|
|
63ed90b0fd | ||
|
|
0bb4d282e2 | ||
|
|
ae89a65dad | ||
|
|
e9fe9f5043 | ||
|
|
ce8dc5927c | ||
|
|
f6989cce38 | ||
|
|
6dbbf9aa80 | ||
|
|
fe6282e837 | ||
|
|
51210a869b | ||
|
|
658652a9ff | ||
|
|
aecd6e0878 | ||
|
|
1334a84861 | ||
|
|
6a410fc30e | ||
|
|
984a68c3a9 | ||
|
|
daf5aa8e8b | ||
|
|
98b2e0e426 | ||
|
|
9a1932eaf7 | ||
|
|
371d4be8ef | ||
|
|
d180031ef0 | ||
|
|
e09e953bbb | ||
|
|
2c640f7e52 | ||
|
|
2bacebb1fb | ||
|
|
df18b2a150 | ||
|
|
216ac4b1a4 | ||
|
|
898cded646 | ||
|
|
c09c87873e | ||
|
|
10b79fb41b | ||
|
|
ec0280be11 | ||
|
|
8e19d54e75 | ||
|
|
3c070e5e20 | ||
|
|
dde599f48f | ||
|
|
cc15ecfb3a | ||
|
|
7a7c54bd59 | ||
|
|
bea88ab122 | ||
|
|
926b3b9ee3 | ||
|
|
bc7775aef2 | ||
|
|
107669686c | ||
|
|
bb11b3ab66 | ||
|
|
516ba85abd | ||
|
|
098277b4f0 | ||
|
|
950a989744 | ||
|
|
fb8b893b10 | ||
|
|
9ca80debb8 | ||
|
|
080241b7d1 | ||
|
|
0d534720bb | ||
|
|
1dc4424a30 | ||
|
|
57f0cf30c0 | ||
|
|
8ef6bc1636 | ||
|
|
974b40c8af | ||
|
|
45e9e0be0b | ||
|
|
ec0918045d | ||
|
|
38bcecd2f3 | ||
|
|
aabbdba068 | ||
|
|
84c183da1f | ||
|
|
b363b98211 | ||
|
|
8defbeb248 | ||
|
|
f52d227d80 | ||
|
|
78cb45fb25 | ||
|
|
2d8026625b | ||
|
|
73afab464f | ||
|
|
8aa139b6be | ||
|
|
e5fe0eabdc | ||
|
|
0d3993fa25 | ||
|
|
ac421f68e2 | ||
|
|
b9d1f0db18 | ||
|
|
6aad4c7a39 | ||
|
|
4186ef204d | ||
|
|
ae7a094ee0 | ||
|
|
3a007f939a | ||
|
|
b8503b9255 | ||
|
|
b7bc76d3cc | ||
|
|
27d6c12972 | ||
|
|
b69d783e09 | ||
|
|
3b2ff6301c | ||
|
|
6c7043916e | ||
|
|
96a6e75b71 | ||
|
|
a91e4e7981 | ||
|
|
95d8f76ec3 | ||
|
|
66d4c2ddd9 | ||
|
|
8115ca739a | ||
|
|
ec4021bbf4 | ||
|
|
e431b07e04 | ||
|
|
d34a87404d | ||
|
|
f38770bf2a | ||
|
|
dc9998ccaf | ||
|
|
f1b3703389 | ||
|
|
b6a8d0ee7f | ||
|
|
2a4dff38d0 | ||
|
|
665c564dcf | ||
|
|
ed71413e04 | ||
|
|
4b5e49b00b | ||
|
|
f558ee788e | ||
|
|
ceb8ca680c | ||
|
|
79ebcbec4b | ||
|
|
2c7b650240 | ||
|
|
54459255d4 | ||
|
|
b4a078e2f6 | ||
|
|
ed13dd066b | ||
|
|
2b4a3b22bf | ||
|
|
8b891da628 | ||
|
|
5a2c8342eb | ||
|
|
50eb4bf53a | ||
|
|
3c10ddd46a | ||
|
|
0b7f9acc70 | ||
|
|
10fbaec247 | ||
|
|
007a734595 | ||
|
|
46716aada3 | ||
|
|
3bc66136b2 | ||
|
|
fae47e0dfc | ||
|
|
bd52e86486 | ||
|
|
b2f6ed7209 | ||
|
|
4b334fd2e2 | ||
|
|
a23a7006e3 | ||
|
|
f47171a17c | ||
|
|
4945dc3682 | ||
|
|
ada66b5313 | ||
|
|
96450e17a3 | ||
|
|
40a295e951 | ||
|
|
d6c6f95373 | ||
|
|
19b46be20d | ||
|
|
789e04ce90 | ||
|
|
dd4f0a600b | ||
|
|
6c7df4cb6b | ||
|
|
79e0a9f32a | ||
|
|
6c9bc63a1c | ||
|
|
28a821df7d | ||
|
|
27e39954d6 | ||
|
|
e730a5364b | ||
|
|
92b3ae41dd | ||
|
|
89a2566e01 | ||
|
|
1ac3e03171 | ||
|
|
b86d40091a | ||
|
|
91d22d150f | ||
|
|
1d29991268 | ||
|
|
6f0a2686dc | ||
|
|
f06caabb07 | ||
|
|
3c869802fb | ||
|
|
7b6bd90903 | ||
|
|
967bfa9c92 | ||
|
|
592affb984 | ||
|
|
96aaf6d53b | ||
|
|
1397dbdabc | ||
|
|
6118643232 | ||
|
|
71198a0b54 | ||
|
|
22cb80399f | ||
|
|
fa1fd8a576 | ||
|
|
6df7d31a5b | ||
|
|
ef049e92ef | ||
|
|
fe8b109ca5 | ||
|
|
8fd9b84a80 | ||
|
|
5cb53f52c3 | ||
|
|
d86653668e | ||
|
|
5084712a15 | ||
|
|
ece65cab18 | ||
|
|
1f6075506c | ||
|
|
51ade48e3d | ||
|
|
21c43737fe | ||
|
|
6c7bcf00e7 | ||
|
|
7a2142075c | ||
|
|
e8e9baa417 | ||
|
|
449d956966 | ||
|
|
90db01d038 | ||
|
|
38cea6dc71 | ||
|
|
64807dfb3b | ||
|
|
d943455e10 | ||
|
|
fd03ba7586 | ||
|
|
2c5a57e386 | ||
|
|
e8858150cb | ||
|
|
333f901187 | ||
|
|
7dd4d6c75e | ||
|
|
99f57cfda6 | ||
|
|
4d1eb94dfd | ||
|
|
22d584f302 | ||
|
|
72c41f104e | ||
|
|
8d3ac3ac1e | ||
|
|
299ae186f1 | ||
|
|
f4df2fb176 | ||
|
|
625fbef613 | ||
|
|
fbed0ac56b | ||
|
|
dc120f3962 | ||
|
|
4f053e5b83 | ||
|
|
c6241581a0 | ||
|
|
041ade66d5 | ||
|
|
067a2949ba | ||
|
|
55c754750e | ||
|
|
72b6c12856 | ||
|
|
15ea0af687 | ||
|
|
ee7e367981 | ||
|
|
8006589828 | ||
|
|
413264eaae | ||
|
|
7db8824da2 | ||
|
|
e1bc010bd1 | ||
|
|
bff02017da | ||
|
|
c0019bd8e5 | ||
|
|
e495ef2c48 | ||
|
|
78d62705cc | ||
|
|
2791bd0015 | ||
|
|
7cf66eb61f | ||
|
|
944c53bff1 | ||
|
|
c756c855ea | ||
|
|
58bb2826b2 | ||
|
|
b7bef87a4d | ||
|
|
0c1b206185 | ||
|
|
7d7e99a92c | ||
|
|
1ba8d7ef74 | ||
|
|
d99bd279e8 | ||
|
|
ee1fe3aa9f | ||
|
|
c4b1d79c5c | ||
|
|
a1a43cdfe0 | ||
|
|
27b62781cc | ||
|
|
0c5d7ff8f2 | ||
|
|
0e2b315ded | ||
|
|
3e74d1c544 | ||
|
|
da690acce5 | ||
|
|
0baa2b484d | ||
|
|
260d7298c3 | ||
|
|
d5cc2ad643 | ||
|
|
12706cd37f | ||
|
|
7167442d6e | ||
|
|
8547101c4b | ||
|
|
5d58a9e4c2 | ||
|
|
cd98a29a4b | ||
|
|
903714fd40 | ||
|
|
138c7acf22 | ||
|
|
03b2b8ae8f | ||
|
|
016b502d46 | ||
|
|
c5f6653564 | ||
|
|
cf9a4e209e | ||
|
|
040421942f | ||
|
|
4dfc596d38 | ||
|
|
fe83ef7635 | ||
|
|
db8b08131f | ||
|
|
32815e628d | ||
|
|
71bdc67a45 | ||
|
|
cb9f50ef63 | ||
|
|
12c754c92b | ||
|
|
e4b3d03da5 | ||
|
|
cc26b66e99 | ||
|
|
34d81fa522 | ||
|
|
49f1a5c2b3 | ||
|
|
326c45fa17 | ||
|
|
a2bb899a6b | ||
|
|
9fedb1674e | ||
|
|
7c91b01125 | ||
|
|
c202e9e106 | ||
|
|
645a8c9349 | ||
|
|
093fdcf3df | ||
|
|
7abda5e8c2 | ||
|
|
abf7c423bb | ||
|
|
55d5c07d00 | ||
|
|
0a9b272fe4 | ||
|
|
b9d6ba2aa0 | ||
|
|
a0c9f7823b | ||
|
|
4477a9c59a | ||
|
|
99a27fe241 | ||
|
|
fefa86e0cf | ||
|
|
098c4910de | ||
|
|
17b7148300 | ||
|
|
f4a2ef28e3 | ||
|
|
f0d013ee76 | ||
|
|
5ece6fec04 | ||
|
|
d88dbf3612 | ||
|
|
2a18efef82 | ||
|
|
fd846fbe77 | ||
|
|
ca7cc4744e | ||
|
|
491fa239bd | ||
|
|
66765dc123 | ||
|
|
70a5348f43 | ||
|
|
2aa61007c6 | ||
|
|
acfbe77ffc | ||
|
|
08696653ca | ||
|
|
8a1a214ca9 | ||
|
|
7aaeb27e0f | ||
|
|
972043c146 | ||
|
|
8475dc082a | ||
|
|
d0e583b29c | ||
|
|
c8feee238b | ||
|
|
6712ecd928 | ||
|
|
d0c7b5d35c | ||
|
|
802add1f97 | ||
|
|
95556811fa | ||
|
|
581472564d | ||
|
|
c7dc8862a5 | ||
|
|
4f8cf019ca | ||
|
|
4c9ac7fcf1 | ||
|
|
1dac05960a | ||
|
|
c27418da77 | ||
|
|
637d076e99 | ||
|
|
391678a5b3 | ||
|
|
4cd0cf1650 | ||
|
|
b813452d33 | ||
|
|
eb85da81e1 | ||
|
|
920cf63201 | ||
|
|
dc09d46bf4 | ||
|
|
05d1b06eeb | ||
|
|
c1661eb06b | ||
|
|
e9626a1d10 | ||
|
|
560bf5ca09 | ||
|
|
512f8d8b60 | ||
|
|
87c8a89349 | ||
|
|
255791f18e | ||
|
|
d5e3416e8e | ||
|
|
5b2d43f665 | ||
|
|
540fc6c2f3 | ||
|
|
b3c5043dcc | ||
|
|
d0d9aae968 | ||
|
|
3270e2bf5a | ||
|
|
013a3e7567 | ||
|
|
8368ba8539 | ||
|
|
ca0310e335 | ||
|
|
4690a678c1 | ||
|
|
f8a39402a2 | ||
|
|
b923e4daea | ||
|
|
247775d1ec | ||
|
|
6e9fea377d | ||
|
|
ca5c65d032 | ||
|
|
f9dc621ebe | ||
|
|
ffe484c31e | ||
|
|
62cd3418ca | ||
|
|
d8a8f3a996 | ||
|
|
0ad8dbbfc9 | ||
|
|
e15a1946c6 | ||
|
|
8878826661 | ||
|
|
95a8b6e5e8 | ||
|
|
388d0d2cfd | ||
|
|
d3a374e71c | ||
|
|
1da2834b1e | ||
|
|
ca3100874f | ||
|
|
117f48a331 | ||
|
|
89bbceefee | ||
|
|
7e18f0e247 | ||
|
|
29c2f24faf | ||
|
|
3bb2dee275 | ||
|
|
88cd5584e8 | ||
|
|
41f9ce2560 | ||
|
|
20044f5749 | ||
|
|
833f0a6aa7 | ||
|
|
10c5ba140c | ||
|
|
316de0b880 | ||
|
|
989966f81b | ||
|
|
ccd550dc52 | ||
|
|
ddf350839a | ||
|
|
6a7dd2787a | ||
|
|
385771e73e | ||
|
|
349ab0b9c5 | ||
|
|
b5e6c6a2f3 | ||
|
|
2832ea641f | ||
|
|
cb7edf2725 | ||
|
|
f1f1be2822 | ||
|
|
7dffd65609 | ||
|
|
2c8a44e28b | ||
|
|
39bb95a6ee | ||
|
|
da9dba80a0 | ||
|
|
12f3285f9b | ||
|
|
7e954e4248 | ||
|
|
d74cc6397b | ||
|
|
777343331e | ||
|
|
a062653743 | ||
|
|
57af0eb64f | ||
|
|
60aae16752 | ||
|
|
e264d95019 | ||
|
|
0664f5a724 | ||
|
|
17c6a19527 | ||
|
|
cbc8b8259b | ||
|
|
1067a2e4be | ||
|
|
74a031a759 | ||
|
|
ee437193fb | ||
|
|
436c53037e | ||
|
|
f55ba9d3cb | ||
|
|
8adb99b768 | ||
|
|
13c42412d2 | ||
|
|
75507d8b35 | ||
|
|
ddfe4932ac | ||
|
|
cf208cc2e3 | ||
|
|
28ac016928 | ||
|
|
f4ae41d006 | ||
|
|
9ec8e5a275 | ||
|
|
a473046058 | ||
|
|
a69b7a5a01 | ||
|
|
640918bcc0 | ||
|
|
f39fbdb3fc | ||
|
|
50d4d81062 | ||
|
|
3b95452481 | ||
|
|
c152ae3c32 | ||
|
|
f6cbaa78e8 | ||
|
|
7adb250b59 | ||
|
|
db5db5aefd | ||
|
|
8fdf84de04 | ||
|
|
ff5cbe80d1 | ||
|
|
e013e0a374 | ||
|
|
b7df312ca7 | ||
|
|
ce82c3c0ae | ||
|
|
2f958cfbda | ||
|
|
8ef41dfd97 | ||
|
|
3082ea4765 | ||
|
|
e482d29951 | ||
|
|
ff48dd7bfb | ||
|
|
7bf9c11822 | ||
|
|
f7937f1e4b | ||
|
|
0115eeabfe | ||
|
|
4b9c3ec0da | ||
|
|
55b81e35a7 | ||
|
|
2a1c7f2d47 | ||
|
|
8603f9838f | ||
|
|
95224f3f11 | ||
|
|
f81acbfe80 | ||
|
|
6d7ff7eba2 | ||
|
|
ad429db7e8 | ||
|
|
4c07abbaf4 | ||
|
|
e3c0551129 | ||
|
|
8971baa42b | ||
|
|
317a1f51f7 | ||
|
|
c63d139482 | ||
|
|
9e682362e9 | ||
|
|
56ec939692 | ||
|
|
a86b942730 | ||
|
|
52eb4c6014 | ||
|
|
f4adbbf90c | ||
|
|
cc86e4a7d2 | ||
|
|
e864447e4a | ||
|
|
73bf552cd6 | ||
|
|
f20a2d2ee9 | ||
|
|
0c25bc063c | ||
|
|
db72781d2a | ||
|
|
0c8ad09040 | ||
|
|
49880ab761 | ||
|
|
fe2d9aa600 | ||
|
|
1dead425e4 | ||
|
|
adb1e47a59 | ||
|
|
ffba8580c1 | ||
|
|
ea18427d29 | ||
|
|
97d42f5c53 | ||
|
|
f3089df086 | ||
|
|
157e7c97ae | ||
|
|
bb8e13e3c9 | ||
|
|
5b4673e8eb | ||
|
|
5b9de8cc07 | ||
|
|
33ea934c8f | ||
|
|
6b3e14b0a4 | ||
|
|
098ceb5567 | ||
|
|
8e2b0632e8 | ||
|
|
420d373d89 | ||
|
|
a59fd7eeb3 | ||
|
|
ee91fa1228 | ||
|
|
a2b5ce0172 | ||
|
|
3efbc71a01 | ||
|
|
b7c5af7e64 | ||
|
|
f939015b97 | ||
|
|
a9ed71f553 | ||
|
|
96a429694f | ||
|
|
fddc5e022e | ||
|
|
2236d53def | ||
|
|
4e018d0a20 | ||
|
|
977b983771 | ||
|
|
fa7a7fe23e | ||
|
|
724a843bbd | ||
|
|
a9ec745275 | ||
|
|
c2ecc15b93 | ||
|
|
83c8650b36 | ||
|
|
89cb809922 | ||
|
|
fdb4eaf437 | ||
|
|
0432f97555 | ||
|
|
8d1631b714 | ||
|
|
dac091552d | ||
|
|
ea027a95a8 | ||
|
|
f73abb05a7 | ||
|
|
d71c49494f | ||
|
|
25665f0841 | ||
|
|
1eec27f890 | ||
|
|
950f86200b | ||
|
|
e19f4931d1 | ||
|
|
0575b1f38d | ||
|
|
f6cd01f7cf | ||
|
|
f2fbc168af | ||
|
|
b50f6f1730 | ||
|
|
f8a7120d9c | ||
|
|
20dbf59420 | ||
|
|
c67a286aa6 | ||
|
|
c96fef6bc8 | ||
|
|
bba02f87ea | ||
|
|
12dc3f5c28 | ||
|
|
0f01a5dcbe | ||
|
|
664dc3bdda | ||
|
|
bdba3cd97d | ||
|
|
d9c0f9315a | ||
|
|
b7f17d435f | ||
|
|
37cdc18639 | ||
|
|
5893a9c49d | ||
|
|
24f58fa16a | ||
|
|
56ffc78fa4 | ||
|
|
061e68bc77 | ||
|
|
177e6312b4 | ||
|
|
1acf4032c2 | ||
|
|
0db752f3a2 | ||
|
|
9c5444698e | ||
|
|
65f3252760 | ||
|
|
e612abe4ba | ||
|
|
ee8b6ebbf6 | ||
|
|
34352e4e0e | ||
|
|
1867b5b317 | ||
|
|
a5b7fca7e0 | ||
|
|
7be2c399b1 | ||
|
|
d6337b3b22 | ||
|
|
d2f8b0ace5 | ||
|
|
d805e8b183 | ||
|
|
1f0f2ec05f | ||
|
|
91ac3b9d7c | ||
|
|
d65bf2eb2f | ||
|
|
1bba9d4307 | ||
|
|
4388338dad | ||
|
|
2fb59c90cf | ||
|
|
68f6ea8def | ||
|
|
3f89295d10 | ||
|
|
748b292e77 | ||
|
|
6451c3d99d | ||
|
|
d14a2de168 | ||
|
|
642150095d | ||
|
|
3bf3ac7922 | ||
|
|
c6d1cebad4 | ||
|
|
08189ce08c | ||
|
|
7013d7d52f | ||
|
|
7045b76f84 | ||
|
|
58a0b4a20d | ||
|
|
0f8eee9809 | ||
|
|
0740299860 | ||
|
|
652215861e | ||
|
|
602209e5a8 | ||
|
|
b60f8b4f70 | ||
|
|
f2b99ccb08 | ||
|
|
b67446d998 | ||
|
|
9670ab0887 | ||
|
|
0223bb85ee | ||
|
|
fd81255db1 | ||
|
|
8a8e1a7f73 | ||
|
|
ef05fbf424 | ||
|
|
fa01b63fa5 | ||
|
|
63d3d25030 | ||
|
|
a8db866228 | ||
|
|
0519eea951 | ||
|
|
f4653ecd11 | ||
|
|
5d67252ed0 | ||
|
|
5134de71c0 | ||
|
|
2be1251c70 | ||
|
|
c0161aa17f | ||
|
|
b683aa11b1 | ||
|
|
2654bb0112 | ||
|
|
d8728104b4 | ||
|
|
0be1b70fba | ||
|
|
a0e9793de3 | ||
|
|
da9200fcee | ||
|
|
54e8e8022b | ||
|
|
d84cf781da | ||
|
|
002f27a30f | ||
|
|
86d88e9773 | ||
|
|
fda00afe6e | ||
|
|
be0c77d556 | ||
|
|
0ed11a7832 | ||
|
|
ff6971fb15 | ||
|
|
5b4dbc8167 | ||
|
|
59f4c9985e | ||
|
|
8da9be1a09 | ||
|
|
11033e108e | ||
|
|
4f97262cf2 | ||
|
|
9b68b9087a | ||
|
|
15cc812e37 | ||
|
|
71317e6aa6 | ||
|
|
1abaaee73e | ||
|
|
78c6d3c02f | ||
|
|
48e9d4af39 | ||
|
|
cb7ad371c6 | ||
|
|
2951589825 | ||
|
|
f23dc5366a | ||
|
|
e3341176c5 | ||
|
|
8938e14442 | ||
|
|
4151778f5e | ||
|
|
23b85cd88d | ||
|
|
234e5cd3e1 | ||
|
|
f75c94a8f1 | ||
|
|
848a432640 | ||
|
|
dea13979e0 | ||
|
|
052d34bf5b | ||
|
|
d4c5e82896 | ||
|
|
562d61caff | ||
|
|
75f18c7c66 | ||
|
|
5d35349dc9 | ||
|
|
1a81173c93 | ||
|
|
1d9201fe3d | ||
|
|
6dbb15027a | ||
|
|
f23d030e43 | ||
|
|
701334ccf2 | ||
|
|
f48a662ed3 | ||
|
|
ced3f1f5fc | ||
|
|
018aa96c8b | ||
|
|
34eda04d9b | ||
|
|
45767ad197 | ||
|
|
f9463af75b | ||
|
|
6f6e28077f | ||
|
|
0a9a7c939a | ||
|
|
f30a5dea79 | ||
|
|
018b547c40 | ||
|
|
e82a720223 | ||
|
|
8d1b77b235 | ||
|
|
b8987faeee | ||
|
|
17fdab2793 | ||
|
|
1fa6520cb6 | ||
|
|
b6af5c16c6 | ||
|
|
10ebe88abf | ||
|
|
c0b41ad6f5 | ||
|
|
9920b30318 | ||
|
|
07f218137a | ||
|
|
89a5248f4f | ||
|
|
891919074e | ||
|
|
4adf527a4d | ||
|
|
533b539780 | ||
|
|
6f26ae9801 | ||
|
|
ddcdfff3ae | ||
|
|
5b48354d9a | ||
|
|
46bfef3fce | ||
|
|
20536bb339 | ||
|
|
f6605ee465 | ||
|
|
034507a35b | ||
|
|
0b2febcec0 | ||
|
|
d2fa735ef1 | ||
|
|
20f34b67da | ||
|
|
03f3db1e89 | ||
|
|
9805b0742d | ||
|
|
6000c696b2 | ||
|
|
5a2edf723b | ||
|
|
aec7da740a | ||
|
|
a79bc75b72 | ||
|
|
eaaebf7928 | ||
|
|
198aa9620e | ||
|
|
27c53a3c25 | ||
|
|
bd70182369 | ||
|
|
04df63d955 | ||
|
|
d59131d670 | ||
|
|
9475e13d81 | ||
|
|
765d86076f | ||
|
|
e2b6ed3db8 |
9
.gitignore
vendored
9
.gitignore
vendored
@@ -5,4 +5,11 @@ ispc
|
||||
ispc_test
|
||||
objs
|
||||
docs/doxygen
|
||||
docs/ispc.html
|
||||
docs/*.html
|
||||
tests*/*cpp
|
||||
tests*/*run
|
||||
examples/*/*.png
|
||||
examples/*/*.ppm
|
||||
examples/*/objs/*
|
||||
|
||||
|
||||
|
||||
228
Makefile
228
Makefile
@@ -1,43 +1,109 @@
|
||||
#
|
||||
# Copyright (c) 2010-2013, Intel Corporation
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# * Neither the name of Intel Corporation nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#
|
||||
# ispc Makefile
|
||||
#
|
||||
|
||||
# If you have your own special version of llvm and/or clang, change
|
||||
# these variables to match.
|
||||
LLVM_CONFIG=$(shell which llvm-config)
|
||||
CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
|
||||
|
||||
# Add llvm bin to the path so any scripts run will go to the right llvm-config
|
||||
LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
|
||||
export PATH:=$(LLVM_BIN):$(PATH)
|
||||
|
||||
ARCH_OS = $(shell uname)
|
||||
ifeq ($(ARCH_OS), Darwin)
|
||||
ARCH_OS2 = "OSX"
|
||||
else
|
||||
ARCH_OS2 = $(shell uname -o)
|
||||
endif
|
||||
ARCH_TYPE = $(shell arch)
|
||||
|
||||
LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs engine ipo bitreader bitwriter instrumentation linker)
|
||||
|
||||
CLANG=clang
|
||||
CLANG_LIBS = -lclangFrontend -lclangDriver \
|
||||
-lclangSerialization -lclangParse -lclangSema \
|
||||
-lclangAnalysis -lclangAST -lclangLex -lclangBasic
|
||||
-lclangAnalysis -lclangAST -lclangBasic \
|
||||
-lclangEdit -lclangLex
|
||||
|
||||
ISPC_LIBS=$(CLANG_LIBS) \
|
||||
$(shell llvm-config --ldflags --libs) \
|
||||
-lpthread -ldl
|
||||
ISPC_TEST_LIBS=$(shell llvm-config --ldflags --libs) \
|
||||
-lpthread -ldl
|
||||
ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
|
||||
-lpthread
|
||||
|
||||
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
|
||||
LLVM_VERSION=$(shell llvm-config --version | sed s/\\./_/)
|
||||
LLVM_VERSION_DEF=-DLLVM_$(LLVM_VERSION)
|
||||
ifeq ($(ARCH_OS),Linux)
|
||||
ISPC_LIBS += -ldl
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH_OS2),Msys)
|
||||
ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
|
||||
endif
|
||||
|
||||
LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
|
||||
LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//)
|
||||
LLVM_VERSION_DEF=-D$(LLVM_VERSION)
|
||||
|
||||
# Define build time stamp and revision.
|
||||
# For revision we use GIT or SVN info.
|
||||
BUILD_DATE=$(shell date +%Y%m%d)
|
||||
BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
|
||||
GIT_REVISION:=$(shell git log --abbrev-commit --abbrev=16 2>/dev/null | head -1)
|
||||
ifeq (${GIT_REVISION},)
|
||||
SVN_REVISION:=$(shell svn log -l 1 2>/dev/null | grep -o \^r[[:digit:]]\* )
|
||||
ifeq (${SVN_REVISION},)
|
||||
# Failed to get revision info
|
||||
BUILD_VERSION:="no_version_info"
|
||||
else
|
||||
# SVN revision info
|
||||
BUILD_VERSION:=$(SVN_REVISION)
|
||||
endif
|
||||
else
|
||||
# GIT revision info
|
||||
BUILD_VERSION:=$(GIT_REVISION)
|
||||
endif
|
||||
|
||||
CXX=g++
|
||||
CPP=cpp
|
||||
OPT=-g3
|
||||
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
|
||||
OPT=-O2
|
||||
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \
|
||||
-Wall $(LLVM_VERSION_DEF) \
|
||||
-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
|
||||
|
||||
LDFLAGS=
|
||||
ifeq ($(ARCH_OS),Linux)
|
||||
# try to link everything statically under Linux (including libstdc++) so
|
||||
# that the binaries we generate will be portable across distributions...
|
||||
ifeq ($(ARCH_TYPE),x86_64)
|
||||
LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
|
||||
else
|
||||
LDFLAGS=-L/usr/lib/gcc/i686-redhat-linux/4.6.0
|
||||
endif
|
||||
# LDFLAGS=-static
|
||||
endif
|
||||
|
||||
LEX=flex
|
||||
@@ -45,28 +111,37 @@ YACC=bison -d -v -t
|
||||
|
||||
###########################################################################
|
||||
|
||||
CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
|
||||
llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
|
||||
util.cpp
|
||||
CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
|
||||
ispc.cpp llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp \
|
||||
type.cpp util.cpp
|
||||
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
|
||||
opt.h stmt.h sym.h type.h util.h
|
||||
BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll builtins-sse2-x2.ll \
|
||||
builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
|
||||
TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
|
||||
generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
|
||||
# These files need to be compiled in two versions - 32 and 64 bits.
|
||||
BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
|
||||
# These are files to be compiled in single version.
|
||||
BUILTINS_SRC_COMMON=builtins/dispatch.ll
|
||||
BUILTINS_OBJS_32=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-32bit.o)))
|
||||
BUILTINS_OBJS_64=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-64bit.o)))
|
||||
BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_COMMON:.ll=.o))) \
|
||||
$(BUILTINS_OBJS_32) $(BUILTINS_OBJS_64) \
|
||||
builtins-c-32.cpp builtins-c-64.cpp
|
||||
BISON_SRC=parse.yy
|
||||
FLEX_SRC=lex.ll
|
||||
|
||||
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
|
||||
builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
|
||||
$(FLEX_SRC:.ll=.o))
|
||||
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
|
||||
stdlib_generic_ispc.o stdlib_x86_ispc.o \
|
||||
$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
|
||||
|
||||
default: ispc ispc_test
|
||||
default: ispc
|
||||
|
||||
.PHONY: dirs clean depend doxygen print_llvm_src
|
||||
.PHONY: dirs clean depend doxygen print_llvm_src llvm_check
|
||||
.PRECIOUS: objs/builtins-%.cpp
|
||||
|
||||
depend: $(CXX_SRC) $(HEADERS)
|
||||
depend: llvm_check $(CXX_SRC) $(HEADERS)
|
||||
@echo Updating dependencies
|
||||
@gcc -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend
|
||||
@$(CXX) -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend
|
||||
|
||||
-include depend
|
||||
|
||||
@@ -74,11 +149,20 @@ dirs:
|
||||
@echo Creating objs/ directory
|
||||
@/bin/mkdir -p objs
|
||||
|
||||
print_llvm_src:
|
||||
llvm_check:
|
||||
@llvm-config --version > /dev/null || \
|
||||
(echo; \
|
||||
echo "******************************************"; \
|
||||
echo "ERROR: llvm-config not found in your PATH"; \
|
||||
echo "******************************************"; \
|
||||
echo; exit 1)
|
||||
|
||||
print_llvm_src: llvm_check
|
||||
@echo Using LLVM `llvm-config --version` from `llvm-config --libdir`
|
||||
@echo Using compiler to build: `$(CXX) --version | head -1`
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs ispc ispc_test
|
||||
/bin/rm -rf objs ispc
|
||||
|
||||
doxygen:
|
||||
/bin/rm -rf docs/doxygen
|
||||
@@ -86,16 +170,37 @@ doxygen:
|
||||
|
||||
ispc: print_llvm_src dirs $(OBJS)
|
||||
@echo Creating ispc executable
|
||||
@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
|
||||
@$(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
|
||||
|
||||
ispc_test: dirs ispc_test.cpp
|
||||
@echo Creating ispc_test executable
|
||||
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(ISPC_TEST_LIBS)
|
||||
# Use clang as a default compiler, instead of gcc
|
||||
clang: ispc
|
||||
clang: CXX=clang++
|
||||
|
||||
# Build ispc with address sanitizer instrumentation using clang compiler
|
||||
# Note that this is not portable build
|
||||
asan: clang
|
||||
asan: OPT+=-fsanitize=address
|
||||
|
||||
# Do debug build, i.e. -O0 -g
|
||||
debug: ispc
|
||||
debug: OPT=-O0 -g
|
||||
|
||||
objs/%.o: %.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/cbackend.o: cbackend.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/opt.o: opt.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) -fno-rtti $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/%.o: objs/%.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/parse.cc: parse.yy
|
||||
@echo Running bison on $<
|
||||
@$(YACC) -o $@ $<
|
||||
@@ -112,41 +217,32 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/builtins-%.cpp: builtins-%.ll
|
||||
@echo Creating C++ source from builtin definitions file $<
|
||||
@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
|
||||
|
||||
objs/builtins-%.o: objs/builtins-%.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/builtins-c-32.cpp: builtins-c.c
|
||||
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 $(wildcard builtins/*common.ll)
|
||||
@echo Creating C++ source from builtins definition file $<
|
||||
@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@
|
||||
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@
|
||||
|
||||
objs/builtins-c-32.o: objs/builtins-c-32.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
|
||||
@echo Creating C++ source from builtins definition file $< \(32 bit version\)
|
||||
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@
|
||||
|
||||
objs/builtins-c-64.cpp: builtins-c.c
|
||||
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
|
||||
@echo Creating C++ source from builtins definition file $< \(64 bit version\)
|
||||
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@
|
||||
|
||||
objs/builtins-c-32.cpp: builtins/builtins.c
|
||||
@echo Creating C++ source from builtins definition file $<
|
||||
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@
|
||||
@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 32 > $@
|
||||
|
||||
objs/builtins-c-64.o: objs/builtins-c-64.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
objs/builtins-c-64.cpp: builtins/builtins.c
|
||||
@echo Creating C++ source from builtins definition file $<
|
||||
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@
|
||||
|
||||
objs/stdlib_ispc.cpp: stdlib.ispc
|
||||
@echo Creating C++ source from $<
|
||||
@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@
|
||||
objs/stdlib_generic_ispc.cpp: stdlib.ispc
|
||||
@echo Creating C++ source from $< for generic
|
||||
@$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
|
||||
python stdlib2cpp.py generic > $@
|
||||
|
||||
objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/builtins-sse2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2.ll
|
||||
objs/builtins-sse2-x2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2-x2.ll
|
||||
objs/builtins-sse4.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4.ll
|
||||
objs/builtins-sse4-x2.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4-x2.ll
|
||||
objs/builtins-avx.cpp: builtins.m4 builtins-avx-common.ll builtins-avx.ll
|
||||
objs/builtins-avx-x2.cpp: builtins.m4 builtins-avx-common.ll builtins-avx-x2.ll
|
||||
objs/stdlib_x86_ispc.cpp: stdlib.ispc
|
||||
@echo Creating C++ source from $< for x86
|
||||
@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
|
||||
python stdlib2cpp.py x86 > $@
|
||||
|
||||
90
README.rst
Normal file
90
README.rst
Normal file
@@ -0,0 +1,90 @@
|
||||
==============================
|
||||
Intel(r) SPMD Program Compiler
|
||||
==============================
|
||||
|
||||
``ispc`` is a compiler for a variant of the C programming language, with
|
||||
extensions for `single program, multiple data
|
||||
<http://en.wikipedia.org/wiki/SPMD>`_ programming. Under the SPMD model,
|
||||
the programmer writes a program that generally appears to be a regular
|
||||
serial program, though the execution model is actually that a number of
|
||||
*program instances* execute in parallel on the hardware.
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
``ispc`` compiles a C-based SPMD programming language to run on the SIMD
|
||||
units of CPUs; it frequently provides a 3x or more speedup on CPUs with
|
||||
4-wide vector SSE units and 5x-6x on CPUs with 8-wide AVX vector units,
|
||||
without any of the difficulty of writing intrinsics code. Parallelization
|
||||
across multiple cores is also supported by ``ispc``, making it
|
||||
possible to write programs that achieve performance improvement that scales
|
||||
by both number of cores and vector unit size.
|
||||
|
||||
There are a few key principles in the design of ``ispc``:
|
||||
|
||||
* To build a small set of extensions to the C language that
|
||||
would deliver excellent performance to performance-oriented
|
||||
programmers who want to run SPMD programs on the CPU.
|
||||
|
||||
* To provide a thin abstraction layer between the programmer
|
||||
and the hardware--in particular, to have an execution and
|
||||
data model where the programmer can cleanly reason about the
|
||||
mapping of their source program to compiled assembly language
|
||||
and the underlying hardware.
|
||||
|
||||
* To make it possible to harness the computational power of SIMD
|
||||
vector units without the extremely low-programmer-productivity
|
||||
activity of directly writing intrinsics.
|
||||
|
||||
* To explore opportunities from close coupling between C/C++
|
||||
application code and SPMD ``ispc`` code running on the
|
||||
same processor--to have lightweight function calls between
|
||||
the two languages and to share data directly via pointers without
|
||||
copying or reformatting.
|
||||
|
||||
``ispc`` is an open source compiler with the BSD license. It uses the
|
||||
remarkable `LLVM Compiler Infrastructure <http://llvm.org>`_ for back-end
|
||||
code generation and optimization and is `hosted on
|
||||
github <http://github.com/ispc/ispc/>`_. It supports Windows, Mac, and
|
||||
Linux, with both x86 and x86-64 targets. It currently supports the SSE2,
|
||||
SSE4, AVX1, and AVX2 instruction sets.
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
``ispc`` provides a number of key features to developers:
|
||||
|
||||
* Familiarity as an extension of the C programming
|
||||
language: ``ispc`` supports familiar C syntax and
|
||||
programming idioms, while adding the ability to write SPMD
|
||||
programs.
|
||||
|
||||
* High-quality SIMD code generation: the performance
|
||||
of code generated by ``ispc`` is often close to that of
|
||||
hand-written intrinsics code.
|
||||
|
||||
* Ease of adoption with existing software
|
||||
systems: functions written in ``ispc`` directly
|
||||
interoperate with application functions written in C/C++ and
|
||||
with application data structures.
|
||||
|
||||
* Portability across over a decade of CPU
|
||||
generations: ``ispc`` has targets for SSE2, SSE4, AVX
|
||||
(and soon, AVX2).
|
||||
|
||||
* Portability across operating systems: Microsoft
|
||||
Windows, Mac OS X, and Linux are all supported
|
||||
by ``ispc``.
|
||||
|
||||
* Debugging with standard tools: ``ispc``
|
||||
programs can be debugged with standard debuggers (OS X and
|
||||
Linux only).
|
||||
|
||||
Additional Resources
|
||||
--------------------
|
||||
|
||||
Prebuilt ``ispc`` binaries for Windows, OS X and Linux can be downloaded
|
||||
from the `ispc downloads page <http://ispc.github.com/downloads.html>`_.
|
||||
See also additional
|
||||
`documentation <http://ispc.github.com/documentation.html>`_ and additional
|
||||
`performance information <http://ispc.github.com/perf.html>`_.
|
||||
22
README.txt
22
README.txt
@@ -1,22 +0,0 @@
|
||||
==============================
|
||||
Intel(r) SPMD Program Compiler
|
||||
==============================
|
||||
|
||||
Welcome to the Intel(r) SPMD Program Compiler (ispc)!
|
||||
|
||||
ispc is a new compiler for "single program, multiple data" (SPMD)
|
||||
programs. Under the SPMD model, the programmer writes a program that mostly
|
||||
appears to be a regular serial program, though the execution model is
|
||||
actually that a number of program instances execute in parallel on the
|
||||
hardware. ispc compiles a C-based SPMD programming language to run on the
|
||||
SIMD units of CPUs; it frequently provides a a 3x or more speedup on CPUs
|
||||
with 4-wide SSE units, without any of the difficulty of writing intrinsics
|
||||
code.
|
||||
|
||||
ispc is an open source compiler under the BSD license; see the file
|
||||
LICENSE.txt. ispc supports Windows, Mac, and Linux, with both x86 and
|
||||
x86-64 targets. It currently supports the SSE2, SSE4, and AVX instruction
|
||||
sets.
|
||||
|
||||
For more information and examples, as well as a wiki and the bug database,
|
||||
see the ispc distribution site, http://ispc.github.com.
|
||||
430
ast.cpp
430
ast.cpp
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,16 +28,21 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file ast.cpp
|
||||
@brief
|
||||
*/
|
||||
|
||||
@brief General functionality related to abstract syntax trees and
|
||||
traversal of them.
|
||||
*/
|
||||
|
||||
#include "ast.h"
|
||||
#include "expr.h"
|
||||
#include "func.h"
|
||||
#include "stmt.h"
|
||||
#include "sym.h"
|
||||
#include "util.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// ASTNode
|
||||
@@ -50,10 +55,10 @@ ASTNode::~ASTNode() {
|
||||
// AST
|
||||
|
||||
void
|
||||
AST::AddFunction(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code) {
|
||||
AST::AddFunction(Symbol *sym, Stmt *code) {
|
||||
if (sym == NULL)
|
||||
return;
|
||||
functions.push_back(new Function(sym, args, code));
|
||||
functions.push_back(new Function(sym, code));
|
||||
}
|
||||
|
||||
|
||||
@@ -63,3 +68,416 @@ AST::GenerateIR() {
|
||||
functions[i]->GenerateIR();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
ASTNode *
|
||||
WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
||||
void *data) {
|
||||
if (node == NULL)
|
||||
return node;
|
||||
|
||||
// Call the callback function
|
||||
if (preFunc != NULL) {
|
||||
if (preFunc(node, data) == false)
|
||||
// The function asked us to not continue recursively, so stop.
|
||||
return node;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Handle Statements
|
||||
if (dynamic_cast<Stmt *>(node) != NULL) {
|
||||
ExprStmt *es;
|
||||
DeclStmt *ds;
|
||||
IfStmt *is;
|
||||
DoStmt *dos;
|
||||
ForStmt *fs;
|
||||
ForeachStmt *fes;
|
||||
ForeachActiveStmt *fas;
|
||||
ForeachUniqueStmt *fus;
|
||||
CaseStmt *cs;
|
||||
DefaultStmt *defs;
|
||||
SwitchStmt *ss;
|
||||
ReturnStmt *rs;
|
||||
LabeledStmt *ls;
|
||||
StmtList *sl;
|
||||
PrintStmt *ps;
|
||||
AssertStmt *as;
|
||||
DeleteStmt *dels;
|
||||
UnmaskedStmt *ums;
|
||||
|
||||
if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
|
||||
es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
|
||||
else if ((ds = dynamic_cast<DeclStmt *>(node)) != NULL) {
|
||||
for (unsigned int i = 0; i < ds->vars.size(); ++i)
|
||||
ds->vars[i].init = (Expr *)WalkAST(ds->vars[i].init, preFunc,
|
||||
postFunc, data);
|
||||
}
|
||||
else if ((is = dynamic_cast<IfStmt *>(node)) != NULL) {
|
||||
is->test = (Expr *)WalkAST(is->test, preFunc, postFunc, data);
|
||||
is->trueStmts = (Stmt *)WalkAST(is->trueStmts, preFunc,
|
||||
postFunc, data);
|
||||
is->falseStmts = (Stmt *)WalkAST(is->falseStmts, preFunc,
|
||||
postFunc, data);
|
||||
}
|
||||
else if ((dos = dynamic_cast<DoStmt *>(node)) != NULL) {
|
||||
dos->testExpr = (Expr *)WalkAST(dos->testExpr, preFunc,
|
||||
postFunc, data);
|
||||
dos->bodyStmts = (Stmt *)WalkAST(dos->bodyStmts, preFunc,
|
||||
postFunc, data);
|
||||
}
|
||||
else if ((fs = dynamic_cast<ForStmt *>(node)) != NULL) {
|
||||
fs->init = (Stmt *)WalkAST(fs->init, preFunc, postFunc, data);
|
||||
fs->test = (Expr *)WalkAST(fs->test, preFunc, postFunc, data);
|
||||
fs->step = (Stmt *)WalkAST(fs->step, preFunc, postFunc, data);
|
||||
fs->stmts = (Stmt *)WalkAST(fs->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((fes = dynamic_cast<ForeachStmt *>(node)) != NULL) {
|
||||
for (unsigned int i = 0; i < fes->startExprs.size(); ++i)
|
||||
fes->startExprs[i] = (Expr *)WalkAST(fes->startExprs[i], preFunc,
|
||||
postFunc, data);
|
||||
for (unsigned int i = 0; i < fes->endExprs.size(); ++i)
|
||||
fes->endExprs[i] = (Expr *)WalkAST(fes->endExprs[i], preFunc,
|
||||
postFunc, data);
|
||||
fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((fas = dynamic_cast<ForeachActiveStmt *>(node)) != NULL) {
|
||||
fas->stmts = (Stmt *)WalkAST(fas->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((fus = dynamic_cast<ForeachUniqueStmt *>(node)) != NULL) {
|
||||
fus->expr = (Expr *)WalkAST(fus->expr, preFunc, postFunc, data);
|
||||
fus->stmts = (Stmt *)WalkAST(fus->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
|
||||
cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
|
||||
else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
|
||||
defs->stmts = (Stmt *)WalkAST(defs->stmts, preFunc, postFunc, data);
|
||||
else if ((ss = dynamic_cast<SwitchStmt *>(node)) != NULL) {
|
||||
ss->expr = (Expr *)WalkAST(ss->expr, preFunc, postFunc, data);
|
||||
ss->stmts = (Stmt *)WalkAST(ss->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if (dynamic_cast<BreakStmt *>(node) != NULL ||
|
||||
dynamic_cast<ContinueStmt *>(node) != NULL ||
|
||||
dynamic_cast<GotoStmt *>(node) != NULL) {
|
||||
// nothing
|
||||
}
|
||||
else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
|
||||
ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
|
||||
else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
|
||||
rs->expr = (Expr *)WalkAST(rs->expr, preFunc, postFunc, data);
|
||||
else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
|
||||
std::vector<Stmt *> &sls = sl->stmts;
|
||||
for (unsigned int i = 0; i < sls.size(); ++i)
|
||||
sls[i] = (Stmt *)WalkAST(sls[i], preFunc, postFunc, data);
|
||||
}
|
||||
else if ((ps = dynamic_cast<PrintStmt *>(node)) != NULL)
|
||||
ps->values = (Expr *)WalkAST(ps->values, preFunc, postFunc, data);
|
||||
else if ((as = dynamic_cast<AssertStmt *>(node)) != NULL)
|
||||
as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
|
||||
else if ((dels = dynamic_cast<DeleteStmt *>(node)) != NULL)
|
||||
dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
|
||||
else if ((ums = dynamic_cast<UnmaskedStmt *>(node)) != NULL)
|
||||
ums->stmts = (Stmt *)WalkAST(ums->stmts, preFunc, postFunc, data);
|
||||
else
|
||||
FATAL("Unhandled statement type in WalkAST()");
|
||||
}
|
||||
else {
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Handle expressions
|
||||
Assert(dynamic_cast<Expr *>(node) != NULL);
|
||||
UnaryExpr *ue;
|
||||
BinaryExpr *be;
|
||||
AssignExpr *ae;
|
||||
SelectExpr *se;
|
||||
ExprList *el;
|
||||
FunctionCallExpr *fce;
|
||||
IndexExpr *ie;
|
||||
MemberExpr *me;
|
||||
TypeCastExpr *tce;
|
||||
ReferenceExpr *re;
|
||||
PtrDerefExpr *ptrderef;
|
||||
RefDerefExpr *refderef;
|
||||
SizeOfExpr *soe;
|
||||
AddressOfExpr *aoe;
|
||||
NewExpr *newe;
|
||||
|
||||
if ((ue = dynamic_cast<UnaryExpr *>(node)) != NULL)
|
||||
ue->expr = (Expr *)WalkAST(ue->expr, preFunc, postFunc, data);
|
||||
else if ((be = dynamic_cast<BinaryExpr *>(node)) != NULL) {
|
||||
be->arg0 = (Expr *)WalkAST(be->arg0, preFunc, postFunc, data);
|
||||
be->arg1 = (Expr *)WalkAST(be->arg1, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((ae = dynamic_cast<AssignExpr *>(node)) != NULL) {
|
||||
ae->lvalue = (Expr *)WalkAST(ae->lvalue, preFunc, postFunc, data);
|
||||
ae->rvalue = (Expr *)WalkAST(ae->rvalue, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((se = dynamic_cast<SelectExpr *>(node)) != NULL) {
|
||||
se->test = (Expr *)WalkAST(se->test, preFunc, postFunc, data);
|
||||
se->expr1 = (Expr *)WalkAST(se->expr1, preFunc, postFunc, data);
|
||||
se->expr2 = (Expr *)WalkAST(se->expr2, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((el = dynamic_cast<ExprList *>(node)) != NULL) {
|
||||
for (unsigned int i = 0; i < el->exprs.size(); ++i)
|
||||
el->exprs[i] = (Expr *)WalkAST(el->exprs[i], preFunc,
|
||||
postFunc, data);
|
||||
}
|
||||
else if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
|
||||
fce->func = (Expr *)WalkAST(fce->func, preFunc, postFunc, data);
|
||||
fce->args = (ExprList *)WalkAST(fce->args, preFunc, postFunc, data);
|
||||
fce->launchCountExpr = (Expr *)WalkAST(fce->launchCountExpr, preFunc,
|
||||
postFunc, data);
|
||||
}
|
||||
else if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL) {
|
||||
ie->baseExpr = (Expr *)WalkAST(ie->baseExpr, preFunc, postFunc, data);
|
||||
ie->index = (Expr *)WalkAST(ie->index, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((me = dynamic_cast<MemberExpr *>(node)) != NULL)
|
||||
me->expr = (Expr *)WalkAST(me->expr, preFunc, postFunc, data);
|
||||
else if ((tce = dynamic_cast<TypeCastExpr *>(node)) != NULL)
|
||||
tce->expr = (Expr *)WalkAST(tce->expr, preFunc, postFunc, data);
|
||||
else if ((re = dynamic_cast<ReferenceExpr *>(node)) != NULL)
|
||||
re->expr = (Expr *)WalkAST(re->expr, preFunc, postFunc, data);
|
||||
else if ((ptrderef = dynamic_cast<PtrDerefExpr *>(node)) != NULL)
|
||||
ptrderef->expr = (Expr *)WalkAST(ptrderef->expr, preFunc, postFunc,
|
||||
data);
|
||||
else if ((refderef = dynamic_cast<RefDerefExpr *>(node)) != NULL)
|
||||
refderef->expr = (Expr *)WalkAST(refderef->expr, preFunc, postFunc,
|
||||
data);
|
||||
else if ((soe = dynamic_cast<SizeOfExpr *>(node)) != NULL)
|
||||
soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
|
||||
else if ((aoe = dynamic_cast<AddressOfExpr *>(node)) != NULL)
|
||||
aoe->expr = (Expr *)WalkAST(aoe->expr, preFunc, postFunc, data);
|
||||
else if ((newe = dynamic_cast<NewExpr *>(node)) != NULL) {
|
||||
newe->countExpr = (Expr *)WalkAST(newe->countExpr, preFunc,
|
||||
postFunc, data);
|
||||
newe->initExpr = (Expr *)WalkAST(newe->initExpr, preFunc,
|
||||
postFunc, data);
|
||||
}
|
||||
else if (dynamic_cast<SymbolExpr *>(node) != NULL ||
|
||||
dynamic_cast<ConstExpr *>(node) != NULL ||
|
||||
dynamic_cast<FunctionSymbolExpr *>(node) != NULL ||
|
||||
dynamic_cast<SyncExpr *>(node) != NULL ||
|
||||
dynamic_cast<NullPointerExpr *>(node) != NULL) {
|
||||
// nothing to do
|
||||
}
|
||||
else
|
||||
FATAL("Unhandled expression type in WalkAST().");
|
||||
}
|
||||
|
||||
// Call the callback function
|
||||
if (postFunc != NULL)
|
||||
return postFunc(node, data);
|
||||
else
|
||||
return node;
|
||||
}
|
||||
|
||||
|
||||
static ASTNode *
|
||||
lOptimizeNode(ASTNode *node, void *) {
|
||||
return node->Optimize();
|
||||
}
|
||||
|
||||
|
||||
ASTNode *
|
||||
Optimize(ASTNode *root) {
|
||||
return WalkAST(root, NULL, lOptimizeNode, NULL);
|
||||
}
|
||||
|
||||
|
||||
Expr *
|
||||
Optimize(Expr *expr) {
|
||||
return (Expr *)Optimize((ASTNode *)expr);
|
||||
}
|
||||
|
||||
|
||||
Stmt *
|
||||
Optimize(Stmt *stmt) {
|
||||
return (Stmt *)Optimize((ASTNode *)stmt);
|
||||
}
|
||||
|
||||
|
||||
static ASTNode *
|
||||
lTypeCheckNode(ASTNode *node, void *) {
|
||||
return node->TypeCheck();
|
||||
}
|
||||
|
||||
|
||||
ASTNode *
|
||||
TypeCheck(ASTNode *root) {
|
||||
return WalkAST(root, NULL, lTypeCheckNode, NULL);
|
||||
}
|
||||
|
||||
|
||||
Expr *
|
||||
TypeCheck(Expr *expr) {
|
||||
return (Expr *)TypeCheck((ASTNode *)expr);
|
||||
}
|
||||
|
||||
|
||||
Stmt *
|
||||
TypeCheck(Stmt *stmt) {
|
||||
return (Stmt *)TypeCheck((ASTNode *)stmt);
|
||||
}
|
||||
|
||||
|
||||
struct CostData {
|
||||
CostData() { cost = foreachDepth = 0; }
|
||||
|
||||
int cost;
|
||||
int foreachDepth;
|
||||
};
|
||||
|
||||
|
||||
static bool
|
||||
lCostCallbackPre(ASTNode *node, void *d) {
|
||||
CostData *data = (CostData *)d;
|
||||
if (dynamic_cast<ForeachStmt *>(node) != NULL)
|
||||
++data->foreachDepth;
|
||||
if (data->foreachDepth == 0)
|
||||
data->cost += node->EstimateCost();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static ASTNode *
|
||||
lCostCallbackPost(ASTNode *node, void *d) {
|
||||
CostData *data = (CostData *)d;
|
||||
if (dynamic_cast<ForeachStmt *>(node) != NULL)
|
||||
--data->foreachDepth;
|
||||
return node;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
EstimateCost(ASTNode *root) {
|
||||
CostData data;
|
||||
WalkAST(root, lCostCallbackPre, lCostCallbackPost, &data);
|
||||
return data.cost;
|
||||
}
|
||||
|
||||
|
||||
/** Given an AST node, check to see if it's safe if we happen to run the
|
||||
code for that node with the execution mask all off.
|
||||
*/
|
||||
static bool
|
||||
lCheckAllOffSafety(ASTNode *node, void *data) {
|
||||
bool *okPtr = (bool *)data;
|
||||
|
||||
FunctionCallExpr *fce;
|
||||
if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
|
||||
if (fce->func == NULL)
|
||||
return false;
|
||||
|
||||
const Type *type = fce->func->GetType();
|
||||
const PointerType *pt = CastType<PointerType>(type);
|
||||
if (pt != NULL)
|
||||
type = pt->GetBaseType();
|
||||
const FunctionType *ftype = CastType<FunctionType>(type);
|
||||
Assert(ftype != NULL);
|
||||
|
||||
if (ftype->isSafe == false) {
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (dynamic_cast<AssertStmt *>(node) != NULL) {
|
||||
// While it's fine to run the assert for varying tests, it's not
|
||||
// desirable to check an assert on a uniform variable if all of the
|
||||
// lanes are off.
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dynamic_cast<NewExpr *>(node) != NULL ||
|
||||
dynamic_cast<DeleteStmt *>(node) != NULL) {
|
||||
// We definitely don't want to run the uniform variants of these if
|
||||
// the mask is all off. It's also worth skipping the overhead of
|
||||
// executing the varying versions of them in the all-off mask case.
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dynamic_cast<ForeachStmt *>(node) != NULL ||
|
||||
dynamic_cast<ForeachActiveStmt *>(node) != NULL ||
|
||||
dynamic_cast<ForeachUniqueStmt *>(node) != NULL ||
|
||||
dynamic_cast<UnmaskedStmt *>(node) != NULL) {
|
||||
// The various foreach statements also shouldn't be run with an
|
||||
// all-off mask. Since they can re-establish an 'all on' mask,
|
||||
// this would be pretty unintuitive. (More generally, it's
|
||||
// possibly a little strange to allow foreach in the presence of
|
||||
// any non-uniform control flow...)
|
||||
//
|
||||
// Similarly, the implementation of foreach_unique assumes as a
|
||||
// precondition that the mask won't be all off going into it, so
|
||||
// we'll enforce that here...
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
IndexExpr *ie;
|
||||
if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
|
||||
const Type *type = ie->baseExpr->GetType();
|
||||
if (type == NULL)
|
||||
return true;
|
||||
if (CastType<ReferenceType>(type) != NULL)
|
||||
type = type->GetReferenceTarget();
|
||||
|
||||
ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
|
||||
if (ce == NULL) {
|
||||
// indexing with a variable... -> not safe
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
const PointerType *pointerType = CastType<PointerType>(type);
|
||||
if (pointerType != NULL) {
|
||||
// pointer[index] -> can't be sure -> not safe
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
const SequentialType *seqType = CastType<SequentialType>(type);
|
||||
Assert(seqType != NULL);
|
||||
int nElements = seqType->GetElementCount();
|
||||
if (nElements == 0) {
|
||||
// Unsized array, so we can't be sure -> not safe
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
int32_t indices[ISPC_MAX_NVEC];
|
||||
int count = ce->AsInt32(indices);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
if (indices[i] < 0 || indices[i] >= nElements) {
|
||||
// Index is out of bounds -> not safe
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// All indices are in-bounds
|
||||
return true;
|
||||
}
|
||||
|
||||
MemberExpr *me;
|
||||
if ((me = dynamic_cast<MemberExpr *>(node)) != NULL &&
|
||||
me->dereferenceExpr) {
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dynamic_cast<PtrDerefExpr *>(node) != NULL) {
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
SafeToRunWithMaskAllOff(ASTNode *root) {
|
||||
bool safe = true;
|
||||
WalkAST(root, lCheckAllOffSafety, NULL, &safe);
|
||||
return safe;
|
||||
}
|
||||
|
||||
74
ast.h
74
ast.h
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,11 +28,11 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file ast.h
|
||||
@brief
|
||||
@brief
|
||||
*/
|
||||
|
||||
#ifndef ISPC_AST_H
|
||||
@@ -53,10 +53,11 @@ public:
|
||||
virtual ~ASTNode();
|
||||
|
||||
/** The Optimize() method should perform any appropriate early-stage
|
||||
optimizations on the node (e.g. constant folding). The caller
|
||||
should use the returned ASTNode * in place of the original node.
|
||||
This method may return NULL if an error is encountered during
|
||||
optimization. */
|
||||
optimizations on the node (e.g. constant folding). This method
|
||||
will be called after the node's children have already been
|
||||
optimized, and the caller will store the returned ASTNode * in
|
||||
place of the original node. This method should return NULL if an
|
||||
error is encountered during optimization. */
|
||||
virtual ASTNode *Optimize() = 0;
|
||||
|
||||
/** Type checking should be performed by the node when this method is
|
||||
@@ -65,6 +66,9 @@ public:
|
||||
pointer in place of the original ASTNode *. */
|
||||
virtual ASTNode *TypeCheck() = 0;
|
||||
|
||||
/** Estimate the execution cost of the node (not including the cost of
|
||||
the children). The value returned should be based on the COST_*
|
||||
enumerant values defined in ispc.h. */
|
||||
virtual int EstimateCost() const = 0;
|
||||
|
||||
/** All AST nodes must track the file position where they are
|
||||
@@ -80,8 +84,7 @@ class AST {
|
||||
public:
|
||||
/** Add the AST for a function described by the given declaration
|
||||
information and source code. */
|
||||
void AddFunction(Symbol *sym, const std::vector<Symbol *> &args,
|
||||
Stmt *code);
|
||||
void AddFunction(Symbol *sym, Stmt *code);
|
||||
|
||||
/** Generate LLVM IR for all of the functions into the current
|
||||
module. */
|
||||
@@ -91,4 +94,57 @@ private:
|
||||
std::vector<Function *> functions;
|
||||
};
|
||||
|
||||
|
||||
/** Callback function type for preorder traversal visiting function for
|
||||
the AST walk.
|
||||
*/
|
||||
typedef bool (* ASTPreCallBackFunc)(ASTNode *node, void *data);
|
||||
|
||||
/** Callback function type for postorder traversal visiting function for
|
||||
the AST walk.
|
||||
*/
|
||||
typedef ASTNode * (* ASTPostCallBackFunc)(ASTNode *node, void *data);
|
||||
|
||||
/** Walk (some portion of) an AST, starting from the given root node. At
|
||||
each node, if preFunc is non-NULL, call it, passing the given void
|
||||
*data pointer; if the call to preFunc function returns false, then the
|
||||
children of the node aren't visited. This function then makes
|
||||
recursive calls to WalkAST() to process the node's children; after
|
||||
doing so, calls postFunc, at the node. The return value from the
|
||||
postFunc call is returned from WalkAST(), replacing the node in its parent. */
|
||||
extern ASTNode *WalkAST(ASTNode *root, ASTPreCallBackFunc preFunc,
|
||||
ASTPostCallBackFunc postFunc, void *data);
|
||||
|
||||
/** Perform simple optimizations on the AST or portion thereof passed to
|
||||
this function, returning the resulting AST. */
|
||||
extern ASTNode *Optimize(ASTNode *root);
|
||||
|
||||
/** Convenience version of Optimize() for Expr *s that returns an Expr *
|
||||
(rather than an ASTNode *, which would require the caller to cast back
|
||||
to an Expr *). */
|
||||
extern Expr *Optimize(Expr *);
|
||||
|
||||
/** Convenience version of Optimize() for Stmt *s that returns a Stmt *
|
||||
(rather than an ASTNode *, which would require the caller to cast back
|
||||
to a Stmt *). */
|
||||
extern Stmt *Optimize(Stmt *);
|
||||
|
||||
/** Perform type-checking on the given AST (or portion of one), returning a
|
||||
pointer to the root of the resulting AST. */
|
||||
extern ASTNode *TypeCheck(ASTNode *root);
|
||||
|
||||
/** Convenience version of TypeCheck() for Expr *s that returns an Expr *. */
|
||||
extern Expr *TypeCheck(Expr *);
|
||||
|
||||
/** Convenience version of TypeCheck() for Stmt *s that returns a Stmt *. */
|
||||
extern Stmt *TypeCheck(Stmt *);
|
||||
|
||||
/** Returns an estimate of the execution cost of the tree starting at
|
||||
the given root. */
|
||||
extern int EstimateCost(ASTNode *root);
|
||||
|
||||
/** Returns true if it would be safe to run the given code with an "all
|
||||
off" mask. */
|
||||
extern bool SafeToRunWithMaskAllOff(ASTNode *root);
|
||||
|
||||
#endif // ISPC_AST_H
|
||||
|
||||
@@ -10,8 +10,13 @@ import os
|
||||
length=0
|
||||
|
||||
src=str(sys.argv[1])
|
||||
if (len(sys.argv) > 2):
|
||||
runtime=str(sys.argv[2])
|
||||
|
||||
target = re.sub(".*builtins-", "", src)
|
||||
target = re.sub("builtins/target-", "", src)
|
||||
target = re.sub(r"builtins\\target-", "", target)
|
||||
target = re.sub("builtins/", "", target)
|
||||
target = re.sub(r"builtins\\", "", target)
|
||||
target = re.sub("\.ll$", "", target)
|
||||
target = re.sub("\.c$", "", target)
|
||||
target = re.sub("-", "_", target)
|
||||
@@ -23,17 +28,24 @@ if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT")
|
||||
try:
|
||||
as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
|
||||
except IOError:
|
||||
print >> sys.stderr, "Couldn't open " + src
|
||||
sys.stderr.write("Couldn't open " + src)
|
||||
sys.exit(1)
|
||||
|
||||
print "unsigned char builtins_bitcode_" + target + "[] = {"
|
||||
for line in as_out.stdout.readlines():
|
||||
length = length + len(line)
|
||||
for c in line:
|
||||
print ord(c)
|
||||
print ", "
|
||||
print " 0 };\n\n"
|
||||
print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n"
|
||||
name = target
|
||||
if (len(sys.argv) > 2):
|
||||
name += "_" + runtime;
|
||||
width = 16;
|
||||
sys.stdout.write("unsigned char builtins_bitcode_" + name + "[] = {\n")
|
||||
|
||||
data = as_out.stdout.read()
|
||||
for i in range(0, len(data), 1):
|
||||
sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))
|
||||
|
||||
if i%width == (width-1):
|
||||
sys.stdout.write("\n")
|
||||
|
||||
sys.stdout.write("0x00 };\n\n")
|
||||
sys.stdout.write("int builtins_bitcode_" + name + "_length = " + str(i+1) + ";\n")
|
||||
|
||||
as_out.wait()
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@ REM Both the LLVM binaries and python need to be in the path
|
||||
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
|
||||
|
||||
msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
|
||||
msbuild ispc_test.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
|
||||
|
||||
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Release /t:rebuild
|
||||
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Debug /t:rebuild
|
||||
|
||||
11
buildispc.bat
Normal file
11
buildispc.bat
Normal file
@@ -0,0 +1,11 @@
|
||||
@echo off
|
||||
|
||||
REM If LLVM_INSTALL_DIR isn't set globally in your environment,
|
||||
REM it can be set here:
|
||||
REM set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
|
||||
REM set LLVM_VERSION=LLVM_3_2
|
||||
|
||||
REM Both the LLVM binaries and python need to be in the path
|
||||
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
|
||||
|
||||
msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
|
||||
482
builtins.cpp
482
builtins.cpp
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2013, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,11 +28,11 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file builtins.cpp
|
||||
@brief Definitions of functions related to setting up the standard library
|
||||
@brief Definitions of functions related to setting up the standard library
|
||||
and other builtins.
|
||||
*/
|
||||
|
||||
@@ -47,12 +47,25 @@
|
||||
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#include <llvm/Type.h>
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Intrinsics.h>
|
||||
#if defined(LLVM_3_2)
|
||||
#include <llvm/Attributes.h>
|
||||
#endif
|
||||
#if defined(LLVM_3_1) || defined(LLVM_3_2)
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#include <llvm/Type.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Intrinsics.h>
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#else
|
||||
#include <llvm/IR/Attributes.h>
|
||||
#include <llvm/IR/LLVMContext.h>
|
||||
#include <llvm/IR/Module.h>
|
||||
#include <llvm/IR/Type.h>
|
||||
#include <llvm/IR/Instructions.h>
|
||||
#include <llvm/IR/Intrinsics.h>
|
||||
#include <llvm/IR/DerivedTypes.h>
|
||||
#endif
|
||||
#include <llvm/Linker.h>
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/ADT/Triple.h>
|
||||
@@ -99,6 +112,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
|
||||
|
||||
// varying
|
||||
if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType &&
|
||||
t == LLVMTypes::MaskType)
|
||||
return AtomicType::VaryingBool;
|
||||
else if (t == LLVMTypes::Int8VectorType)
|
||||
return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
|
||||
else if (t == LLVMTypes::Int16VectorType)
|
||||
@@ -153,9 +169,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
|
||||
|
||||
static void
|
||||
lCreateSymbol(const std::string &name, const Type *returnType,
|
||||
const std::vector<const Type *> &argTypes,
|
||||
const llvm::FunctionType *ftype, llvm::Function *func,
|
||||
lCreateSymbol(const std::string &name, const Type *returnType,
|
||||
llvm::SmallVector<const Type *, 8> &argTypes,
|
||||
const llvm::FunctionType *ftype, llvm::Function *func,
|
||||
SymbolTable *symbolTable) {
|
||||
SourcePos noPos;
|
||||
noPos.name = "__stdlib";
|
||||
@@ -194,9 +210,9 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
// symbol creation code below assumes that any LLVM vector of i32s is a
|
||||
// varying int32. Here, we need that to be interpreted as a varying
|
||||
// bool, so just have a one-off override for that one...
|
||||
if (name == "__sext_varying_bool") {
|
||||
if (g->target->getMaskBitCount() != 1 && name == "__sext_varying_bool") {
|
||||
const Type *returnType = AtomicType::VaryingInt32;
|
||||
std::vector<const Type *> argTypes;
|
||||
llvm::SmallVector<const Type *, 8> argTypes;
|
||||
argTypes.push_back(AtomicType::VaryingBool);
|
||||
|
||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||
@@ -226,7 +242,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
// Iterate over the arguments and try to find their equivalent ispc
|
||||
// types. Track if any of the arguments has an integer type.
|
||||
bool anyIntArgs = false;
|
||||
std::vector<const Type *> argTypes;
|
||||
llvm::SmallVector<const Type *, 8> argTypes;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
|
||||
const llvm::Type *llvmArgType = ftype->getParamType(j);
|
||||
const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
|
||||
@@ -235,7 +251,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
"representable for builtin %s", j, name.c_str());
|
||||
return false;
|
||||
}
|
||||
anyIntArgs |=
|
||||
anyIntArgs |=
|
||||
(Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
|
||||
argTypes.push_back(type);
|
||||
}
|
||||
@@ -257,7 +273,7 @@ static void
|
||||
lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
#if 0
|
||||
// FIXME: handle globals?
|
||||
assert(module->global_empty());
|
||||
Assert(module->global_empty());
|
||||
#endif
|
||||
|
||||
llvm::Module::iterator iter;
|
||||
@@ -270,7 +286,7 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
|
||||
/** In many of the builtins-*.ll files, we have declarations of various LLVM
|
||||
intrinsics that are then used in the implementation of various target-
|
||||
specific functions. This function loops over all of the intrinsic
|
||||
specific functions. This function loops over all of the intrinsic
|
||||
declarations and makes sure that the signature we have in our .ll file
|
||||
matches the signature of the actual intrinsic.
|
||||
*/
|
||||
@@ -287,11 +303,11 @@ lCheckModuleIntrinsics(llvm::Module *module) {
|
||||
// check the llvm.x86.* intrinsics for now...
|
||||
if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
|
||||
llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
|
||||
assert(id != 0);
|
||||
LLVM_TYPE_CONST llvm::Type *intrinsicType =
|
||||
Assert(id != 0);
|
||||
llvm::Type *intrinsicType =
|
||||
llvm::Intrinsic::getType(*g->ctx, id);
|
||||
intrinsicType = llvm::PointerType::get(intrinsicType, 0);
|
||||
assert(func->getType() == intrinsicType);
|
||||
Assert(func->getType() == intrinsicType);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -311,10 +327,16 @@ lCheckModuleIntrinsics(llvm::Module *module) {
|
||||
static void
|
||||
lSetInternalFunctions(llvm::Module *module) {
|
||||
const char *names[] = {
|
||||
"__add_float",
|
||||
"__add_int32",
|
||||
"__add_uniform_double",
|
||||
"__add_uniform_int32",
|
||||
"__add_uniform_int64",
|
||||
"__add_varying_double",
|
||||
"__add_varying_int32",
|
||||
"__add_varying_int64",
|
||||
"__all",
|
||||
"__any",
|
||||
"__aos_to_soa3_float",
|
||||
"__aos_to_soa3_float16",
|
||||
"__aos_to_soa3_float4",
|
||||
@@ -371,21 +393,26 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__atomic_xor_uniform_int64_global",
|
||||
"__broadcast_double",
|
||||
"__broadcast_float",
|
||||
"__broadcast_int16",
|
||||
"__broadcast_int32",
|
||||
"__broadcast_int64",
|
||||
"__broadcast_int8",
|
||||
"__broadcast_i16",
|
||||
"__broadcast_i32",
|
||||
"__broadcast_i64",
|
||||
"__broadcast_i8",
|
||||
"__ceil_uniform_double",
|
||||
"__ceil_uniform_float",
|
||||
"__ceil_varying_double",
|
||||
"__ceil_varying_float",
|
||||
"__clock",
|
||||
"__count_trailing_zeros_i32",
|
||||
"__count_trailing_zeros_i64",
|
||||
"__count_leading_zeros_i32",
|
||||
"__count_leading_zeros_i64",
|
||||
"__delete_uniform_32rt",
|
||||
"__delete_uniform_64rt",
|
||||
"__delete_varying_32rt",
|
||||
"__delete_varying_64rt",
|
||||
"__do_assert_uniform",
|
||||
"__do_assert_varying",
|
||||
"__do_print",
|
||||
"__do_print",
|
||||
"__doublebits_uniform_int64",
|
||||
"__doublebits_varying_int64",
|
||||
"__exclusive_scan_add_double",
|
||||
@@ -401,12 +428,17 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__extract_int64",
|
||||
"__extract_int8",
|
||||
"__fastmath",
|
||||
"__float_to_half_uniform",
|
||||
"__float_to_half_varying",
|
||||
"__floatbits_uniform_int32",
|
||||
"__floatbits_varying_int32",
|
||||
"__floor_uniform_double",
|
||||
"__floor_uniform_float",
|
||||
"__floor_varying_double",
|
||||
"__floor_varying_float",
|
||||
"__get_system_isa",
|
||||
"__half_to_float_uniform",
|
||||
"__half_to_float_varying",
|
||||
"__insert_int16",
|
||||
"__insert_int32",
|
||||
"__insert_int64",
|
||||
@@ -428,6 +460,12 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__max_varying_uint32",
|
||||
"__max_varying_uint64",
|
||||
"__memory_barrier",
|
||||
"__memcpy32",
|
||||
"__memcpy64",
|
||||
"__memmove32",
|
||||
"__memmove64",
|
||||
"__memset32",
|
||||
"__memset64",
|
||||
"__min_uniform_double",
|
||||
"__min_uniform_float",
|
||||
"__min_uniform_int32",
|
||||
@@ -441,9 +479,16 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__min_varying_uint32",
|
||||
"__min_varying_uint64",
|
||||
"__movmsk",
|
||||
"__new_uniform_32rt",
|
||||
"__new_uniform_64rt",
|
||||
"__new_varying32_32rt",
|
||||
"__new_varying32_64rt",
|
||||
"__new_varying64_64rt",
|
||||
"__none",
|
||||
"__num_cores",
|
||||
"__packed_load_active",
|
||||
"__packed_store_active",
|
||||
"__pause",
|
||||
"__popcnt_int32",
|
||||
"__popcnt_int64",
|
||||
"__prefetch_read_uniform_1",
|
||||
@@ -452,12 +497,13 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__prefetch_read_uniform_nt",
|
||||
"__rcp_uniform_float",
|
||||
"__rcp_varying_float",
|
||||
"__rdrand_i16",
|
||||
"__rdrand_i32",
|
||||
"__rdrand_i64",
|
||||
"__reduce_add_double",
|
||||
"__reduce_add_float",
|
||||
"__reduce_add_int32",
|
||||
"__reduce_add_int64",
|
||||
"__reduce_add_uint32",
|
||||
"__reduce_add_uint64",
|
||||
"__reduce_equal_double",
|
||||
"__reduce_equal_float",
|
||||
"__reduce_equal_int32",
|
||||
@@ -476,30 +522,31 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__reduce_min_uint64",
|
||||
"__rotate_double",
|
||||
"__rotate_float",
|
||||
"__rotate_int16",
|
||||
"__rotate_int32",
|
||||
"__rotate_int64",
|
||||
"__rotate_int8",
|
||||
"__rotate_i16",
|
||||
"__rotate_i32",
|
||||
"__rotate_i64",
|
||||
"__rotate_i8",
|
||||
"__round_uniform_double",
|
||||
"__round_uniform_float",
|
||||
"__round_varying_double",
|
||||
"__round_varying_float",
|
||||
"__rsqrt_uniform_float",
|
||||
"__rsqrt_varying_float",
|
||||
"__set_system_isa",
|
||||
"__sext_uniform_bool",
|
||||
"__sext_varying_bool",
|
||||
"__shuffle2_double",
|
||||
"__shuffle2_float",
|
||||
"__shuffle2_int16",
|
||||
"__shuffle2_int32",
|
||||
"__shuffle2_int64",
|
||||
"__shuffle2_int8",
|
||||
"__shuffle2_i16",
|
||||
"__shuffle2_i32",
|
||||
"__shuffle2_i64",
|
||||
"__shuffle2_i8",
|
||||
"__shuffle_double",
|
||||
"__shuffle_float",
|
||||
"__shuffle_int16",
|
||||
"__shuffle_int32",
|
||||
"__shuffle_int64",
|
||||
"__shuffle_int8",
|
||||
"__shuffle_i16",
|
||||
"__shuffle_i32",
|
||||
"__shuffle_i64",
|
||||
"__shuffle_i8",
|
||||
"__soa_to_aos3_float",
|
||||
"__soa_to_aos3_float16",
|
||||
"__soa_to_aos3_float4",
|
||||
@@ -514,6 +561,8 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__sqrt_uniform_float",
|
||||
"__sqrt_varying_double",
|
||||
"__sqrt_varying_float",
|
||||
"__stdlib_acosf",
|
||||
"__stdlib_asinf",
|
||||
"__stdlib_atan",
|
||||
"__stdlib_atan2",
|
||||
"__stdlib_atan2f",
|
||||
@@ -543,13 +592,19 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__svml_pow",
|
||||
"__undef_uniform",
|
||||
"__undef_varying",
|
||||
"__vec4_add_float",
|
||||
"__vec4_add_int32",
|
||||
"__vselect_float",
|
||||
"__vselect_i32",
|
||||
};
|
||||
|
||||
int count = sizeof(names) / sizeof(names[0]);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
llvm::Function *f = module->getFunction(names[i]);
|
||||
if (f != NULL)
|
||||
if (f != NULL && f->empty() == false) {
|
||||
f->setLinkage(llvm::GlobalValue::InternalLinkage);
|
||||
g->target->markFuncWithTargetAttr(f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -583,17 +638,31 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
// linking together modules with incompatible target triples..
|
||||
llvm::Triple mTriple(m->module->getTargetTriple());
|
||||
llvm::Triple bcTriple(bcModule->getTargetTriple());
|
||||
assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
|
||||
Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
|
||||
mTriple.getArch() == bcTriple.getArch());
|
||||
assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
|
||||
Assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
|
||||
mTriple.getVendor() == bcTriple.getVendor());
|
||||
bcModule->setTargetTriple(mTriple.str());
|
||||
|
||||
// We unconditionally apply the module's DataLayout to the library, but we must
|
||||
// ensure that library and module DataLayouts are compatible.
|
||||
// If they are not, we should recompile the library for problematic
|
||||
// architecture and investigate what happened.
|
||||
// Generally we allow library DataLayout to be subset of module
|
||||
// DataLayout or library DataLayout to be empty.
|
||||
if (!VerifyDataLayoutCompatibility(module->getDataLayout(),
|
||||
bcModule->getDataLayout())) {
|
||||
Error(SourcePos(), "Module DataLayout is incompatible with library DataLayout:\n"
|
||||
"Module DL: %s\n"
|
||||
"Library DL: %s\n",
|
||||
module->getDataLayout().c_str(), bcModule->getDataLayout().c_str());
|
||||
}
|
||||
|
||||
bcModule->setDataLayout(module->getDataLayout());
|
||||
|
||||
std::string(linkError);
|
||||
if (llvm::Linker::LinkModules(module, bcModule,
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
if (llvm::Linker::LinkModules(module, bcModule,
|
||||
llvm::Linker::DestroySource,
|
||||
#endif // LLVM_3_0
|
||||
&linkError))
|
||||
Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
|
||||
lSetInternalFunctions(module);
|
||||
@@ -610,15 +679,37 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
static void
|
||||
lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
||||
SymbolTable *symbolTable) {
|
||||
Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32,
|
||||
SC_STATIC);
|
||||
pw->constValue = new ConstExpr(pw->type, val, SourcePos());
|
||||
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
|
||||
Symbol *sym =
|
||||
new Symbol(name, SourcePos(), AtomicType::UniformInt32->GetAsConstType(),
|
||||
SC_STATIC);
|
||||
sym->constValue = new ConstExpr(sym->type, val, SourcePos());
|
||||
llvm::Type *ltype = LLVMTypes::Int32Type;
|
||||
llvm::Constant *linit = LLVMInt32(val);
|
||||
pw->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
|
||||
llvm::GlobalValue::InternalLinkage,
|
||||
linit, pw->name.c_str());
|
||||
symbolTable->AddVariable(pw);
|
||||
// Use WeakODRLinkage rather than InternalLinkage so that a definition
|
||||
// survives even if it's not used in the module, so that the symbol is
|
||||
// there in the debugger.
|
||||
llvm::GlobalValue::LinkageTypes linkage = g->generateDebuggingSymbols ?
|
||||
llvm::GlobalValue::WeakODRLinkage : llvm::GlobalValue::InternalLinkage;
|
||||
sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, linkage,
|
||||
linit, name);
|
||||
symbolTable->AddVariable(sym);
|
||||
|
||||
if (m->diBuilder != NULL) {
|
||||
llvm::DIFile file;
|
||||
llvm::DIType diType = sym->type->GetDIType(file);
|
||||
Assert(diType.Verify());
|
||||
// FIXME? DWARF says that this (and programIndex below) should
|
||||
// have the DW_AT_artificial attribute.  It's not clear if this
|
||||
// matters for anything though.
|
||||
llvm::DIGlobalVariable var =
|
||||
m->diBuilder->createGlobalVariable(name,
|
||||
file,
|
||||
0 /* line */,
|
||||
diType,
|
||||
true /* static */,
|
||||
sym->storagePtr);
|
||||
Assert(var.Verify());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -626,13 +717,17 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
||||
static void
|
||||
lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
||||
SymbolTable *symbolTable) {
|
||||
std::vector<const Type *> args;
|
||||
llvm::SmallVector<const Type *, 8> args;
|
||||
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
|
||||
Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
|
||||
|
||||
llvm::Function *func = module->getFunction(name);
|
||||
assert(func != NULL); // it should be declared already...
|
||||
Assert(func != NULL); // it should be declared already...
|
||||
#if defined(LLVM_3_2)
|
||||
func->addFnAttr(llvm::Attributes::AlwaysInline);
|
||||
#else // LLVM 3.1 and 3.3+
|
||||
func->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
#endif
|
||||
llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
|
||||
llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
|
||||
|
||||
@@ -644,103 +739,238 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
||||
|
||||
static void
|
||||
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
Symbol *pidx = new Symbol("programIndex", SourcePos(),
|
||||
AtomicType::VaryingConstInt32, SC_STATIC);
|
||||
Symbol *sym =
|
||||
new Symbol("programIndex", SourcePos(),
|
||||
AtomicType::VaryingInt32->GetAsConstType(), SC_STATIC);
|
||||
|
||||
int pi[ISPC_MAX_NVEC];
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
for (int i = 0; i < g->target->getVectorWidth(); ++i)
|
||||
pi[i] = i;
|
||||
pidx->constValue = new ConstExpr(pidx->type, pi, SourcePos());
|
||||
sym->constValue = new ConstExpr(sym->type, pi, SourcePos());
|
||||
|
||||
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32VectorType;
|
||||
llvm::Type *ltype = LLVMTypes::Int32VectorType;
|
||||
llvm::Constant *linit = LLVMInt32Vector(pi);
|
||||
pidx->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
|
||||
llvm::GlobalValue::InternalLinkage, linit,
|
||||
pidx->name.c_str());
|
||||
symbolTable->AddVariable(pidx);
|
||||
// See comment in lDefineConstantInt() for why WeakODRLinkage is used here
|
||||
llvm::GlobalValue::LinkageTypes linkage = g->generateDebuggingSymbols ?
|
||||
llvm::GlobalValue::WeakODRLinkage : llvm::GlobalValue::InternalLinkage;
|
||||
sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, linkage,
|
||||
linit, sym->name.c_str());
|
||||
symbolTable->AddVariable(sym);
|
||||
|
||||
if (m->diBuilder != NULL) {
|
||||
llvm::DIFile file;
|
||||
llvm::DIType diType = sym->type->GetDIType(file);
|
||||
Assert(diType.Verify());
|
||||
llvm::DIGlobalVariable var =
|
||||
m->diBuilder->createGlobalVariable(sym->name.c_str(),
|
||||
file,
|
||||
0 /* line */,
|
||||
diType,
|
||||
false /* static */,
|
||||
sym->storagePtr);
|
||||
Assert(var.Verify());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
|
||||
bool includeStdlibISPC) {
|
||||
bool runtime32 = g->target->is32Bit();
|
||||
|
||||
#define EXPORT_MODULE(export_module) \
|
||||
extern unsigned char export_module[]; \
|
||||
extern int export_module##_length; \
|
||||
AddBitcodeToModule(export_module, export_module##_length, \
|
||||
module, symbolTable);
|
||||
|
||||
// Add the definitions from the compiled builtins-c.c file
|
||||
if (g->target.is32Bit) {
|
||||
extern unsigned char builtins_bitcode_c_32[];
|
||||
extern int builtins_bitcode_c_32_length;
|
||||
AddBitcodeToModule(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
|
||||
module, symbolTable);
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_c_32);
|
||||
}
|
||||
else {
|
||||
extern unsigned char builtins_bitcode_c_64[];
|
||||
extern int builtins_bitcode_c_64_length;
|
||||
AddBitcodeToModule(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
|
||||
module, symbolTable);
|
||||
EXPORT_MODULE(builtins_bitcode_c_64);
|
||||
}
|
||||
|
||||
// Next, add the target's custom implementations of the various needed
|
||||
// builtin functions (e.g. __masked_store_32(), etc).
|
||||
switch (g->target.isa) {
|
||||
case Target::SSE2:
|
||||
extern unsigned char builtins_bitcode_sse2[];
|
||||
extern int builtins_bitcode_sse2_length;
|
||||
extern unsigned char builtins_bitcode_sse2_x2[];
|
||||
extern int builtins_bitcode_sse2_x2_length;
|
||||
switch (g->target.vectorWidth) {
|
||||
case 4:
|
||||
AddBitcodeToModule(builtins_bitcode_sse2, builtins_bitcode_sse2_length,
|
||||
module, symbolTable);
|
||||
switch (g->target->getISA()) {
|
||||
case Target::SSE2: {
|
||||
switch (g->target->getVectorWidth()) {
|
||||
case 4:
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_sse2_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_sse2_64bit);
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
AddBitcodeToModule(builtins_bitcode_sse2_x2, builtins_bitcode_sse2_x2_length,
|
||||
module, symbolTable);
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_sse2_x2_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_sse2_x2_64bit);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
case Target::SSE4:
|
||||
extern unsigned char builtins_bitcode_sse4[];
|
||||
extern int builtins_bitcode_sse4_length;
|
||||
extern unsigned char builtins_bitcode_sse4_x2[];
|
||||
extern int builtins_bitcode_sse4_x2_length;
|
||||
switch (g->target.vectorWidth) {
|
||||
case 4:
|
||||
AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
|
||||
module, symbolTable);
|
||||
}
|
||||
case Target::SSE4: {
|
||||
switch (g->target->getVectorWidth()) {
|
||||
case 4:
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_sse4_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_sse4_64bit);
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length,
|
||||
module, symbolTable);
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
case Target::AVX:
|
||||
switch (g->target.vectorWidth) {
|
||||
}
|
||||
case Target::AVX: {
|
||||
switch (g->target->getVectorWidth()) {
|
||||
case 8:
|
||||
extern unsigned char builtins_bitcode_avx[];
|
||||
extern int builtins_bitcode_avx_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length,
|
||||
module, symbolTable);
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_avx1_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_avx1_64bit);
|
||||
}
|
||||
break;
|
||||
case 16:
|
||||
extern unsigned char builtins_bitcode_avx_x2[];
|
||||
extern int builtins_bitcode_avx_x2_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
|
||||
module, symbolTable);
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_avx1_x2_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_avx1_x2_64bit);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
}
|
||||
case Target::AVX11: {
|
||||
switch (g->target->getVectorWidth()) {
|
||||
case 8:
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_avx11_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_avx11_64bit);
|
||||
}
|
||||
break;
|
||||
case 16:
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_avx11_x2_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_avx11_x2_64bit);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
}
|
||||
case Target::AVX2: {
|
||||
switch (g->target->getVectorWidth()) {
|
||||
case 8:
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_avx2_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_avx2_64bit);
|
||||
}
|
||||
break;
|
||||
case 16:
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_avx2_x2_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_avx2_x2_64bit);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
}
|
||||
case Target::GENERIC: {
|
||||
switch (g->target->getVectorWidth()) {
|
||||
case 4:
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_generic_4_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_generic_4_64bit);
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_generic_8_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_generic_8_64bit);
|
||||
}
|
||||
break;
|
||||
case 16:
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_generic_16_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_generic_16_64bit);
|
||||
}
|
||||
break;
|
||||
case 32:
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_generic_32_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_generic_32_64bit);
|
||||
}
|
||||
break;
|
||||
case 64:
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_generic_64_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_generic_64_64bit);
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
if (runtime32) {
|
||||
EXPORT_MODULE(builtins_bitcode_generic_1_32bit);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_generic_1_64bit);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
FATAL("logic error");
|
||||
}
|
||||
|
||||
// define the 'programCount' builtin variable
|
||||
lDefineConstantInt("programCount", g->target.vectorWidth, module, symbolTable);
|
||||
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
|
||||
|
||||
// define the 'programIndex' builtin
|
||||
lDefineProgramIndex(module, symbolTable);
|
||||
@@ -750,23 +980,41 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
lDefineConstantInt("__math_lib", (int)g->mathLib, module, symbolTable);
|
||||
lDefineConstantInt("__math_lib_ispc", (int)Globals::Math_ISPC, module,
|
||||
symbolTable);
|
||||
lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast,
|
||||
lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast,
|
||||
module, symbolTable);
|
||||
lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module,
|
||||
symbolTable);
|
||||
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
|
||||
symbolTable);
|
||||
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
|
||||
symbolTable);
|
||||
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload,
|
||||
module, symbolTable);
|
||||
|
||||
lDefineConstantInt("__have_native_half", g->target->hasHalf(), module,
|
||||
symbolTable);
|
||||
lDefineConstantInt("__have_native_rand", g->target->hasRand(), module,
|
||||
symbolTable);
|
||||
lDefineConstantInt("__have_native_transcendentals", g->target->hasTranscendentals(),
|
||||
module, symbolTable);
|
||||
|
||||
if (g->forceAlignment != -1) {
|
||||
llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);
|
||||
alignment->setInitializer(LLVMInt32(g->forceAlignment));
|
||||
}
|
||||
|
||||
if (includeStdlibISPC) {
|
||||
// If the user wants the standard library to be included, parse the
|
||||
// serialized version of the stdlib.ispc file to get its
|
||||
// definitions added. Disable emission of performance warnings for
|
||||
// now, since the user doesn't care about any of that in the stdlib
|
||||
// implementation...
|
||||
extern char stdlib_code[];
|
||||
yy_scan_string(stdlib_code);
|
||||
yyparse();
|
||||
// definitions added.
|
||||
if (g->target->getISA() == Target::GENERIC &&
|
||||
g->target->getVectorWidth() != 1) { // 1 wide uses x86 stdlib
|
||||
extern char stdlib_generic_code[];
|
||||
yy_scan_string(stdlib_generic_code);
|
||||
yyparse();
|
||||
}
|
||||
else {
|
||||
extern char stdlib_x86_code[];
|
||||
yy_scan_string(stdlib_x86_code);
|
||||
yyparse();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -28,11 +28,11 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file builtins.h
|
||||
@brief Declarations of functions related to builtins and the
|
||||
@brief Declarations of functions related to builtins and the
|
||||
standard library
|
||||
*/
|
||||
|
||||
|
||||
2915
builtins.m4
2915
builtins.m4
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file builtins-c.c
|
||||
@@ -50,6 +50,16 @@
|
||||
available to ispc programs at compile time automatically.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// We do want old school sprintf and don't want secure Microsoft extensions.
|
||||
// And we also don't want warnings about it, so the define.
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#else
|
||||
// Some versions of glibc have a "fortification" feature, which expands sprintf
|
||||
// to __builtin___sprintf_chk(..., __builtin_object_size(...), ...).
|
||||
// We don't want this kind of expansion, as we don't support these intrinsics.
|
||||
#define _FORTIFY_SOURCE 0
|
||||
#endif
|
||||
|
||||
#ifndef _MSC_VER
|
||||
#include <unistd.h>
|
||||
@@ -59,22 +69,39 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
|
||||
typedef int Bool;
|
||||
|
||||
#define PRINT_SCALAR(fmt, type) \
|
||||
printf(fmt, *((type *)ptr)); \
|
||||
#define PRINT_BUF_SIZE 4096
|
||||
|
||||
#define APPEND(str) \
|
||||
do { \
|
||||
int offset = bufp - &printString[0]; \
|
||||
*bufp = '\0'; \
|
||||
strncat(bufp, str, PRINT_BUF_SIZE-offset); \
|
||||
bufp += strlen(str); \
|
||||
if (bufp >= &printString[PRINT_BUF_SIZE]) \
|
||||
goto done; \
|
||||
} while (0) /* eat semicolon */
|
||||
|
||||
|
||||
#define PRINT_SCALAR(fmt, type) \
|
||||
sprintf(tmpBuf, fmt, *((type *)ptr)); \
|
||||
APPEND(tmpBuf); \
|
||||
break
|
||||
|
||||
#define PRINT_VECTOR(fmt, type) \
|
||||
putchar('['); \
|
||||
*bufp++ = '['; \
|
||||
if (bufp == &printString[PRINT_BUF_SIZE]) break; \
|
||||
for (int i = 0; i < width; ++i) { \
|
||||
/* only print the value if the current lane is executing */ \
|
||||
if (mask & (1<<i)) \
|
||||
printf(fmt, ((type *)ptr)[i]); \
|
||||
if (mask & (1ull<<i)) \
|
||||
sprintf(tmpBuf, fmt, ((type *)ptr)[i]); \
|
||||
else \
|
||||
printf("((" fmt "))", ((type *)ptr)[i]); \
|
||||
putchar(i != width-1 ? ',' : ']'); \
|
||||
sprintf(tmpBuf, "((" fmt "))", ((type *)ptr)[i]); \
|
||||
APPEND(tmpBuf); \
|
||||
*bufp++ = (i != width-1 ? ',' : ']'); \
|
||||
} \
|
||||
break
|
||||
|
||||
@@ -84,21 +111,23 @@ typedef int Bool;
|
||||
|
||||
@param format Print format string
|
||||
@param types Encoded types of the values being printed.
|
||||
(See lEncodeType()).
|
||||
(See lEncodeType()).
|
||||
@param width Vector width of the compilation target
|
||||
@param mask Current lane mask when the print statement is called
|
||||
@param args Array of pointers to the values to be printed
|
||||
*/
|
||||
void __do_print(const char *format, const char *types, int width, int mask,
|
||||
void __do_print(const char *format, const char *types, int width, uint64_t mask,
|
||||
void **args) {
|
||||
if (mask == 0)
|
||||
return;
|
||||
char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
|
||||
char *bufp = &printString[0];
|
||||
char tmpBuf[256];
|
||||
|
||||
int argCount = 0;
|
||||
while (*format) {
|
||||
while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
|
||||
// Format strings are just single percent signs.
|
||||
if (*format != '%')
|
||||
putchar(*format);
|
||||
if (*format != '%') {
|
||||
*bufp++ = *format;
|
||||
}
|
||||
else {
|
||||
if (*types) {
|
||||
void *ptr = args[argCount++];
|
||||
@@ -107,17 +136,22 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
||||
// printf() formatting string.
|
||||
switch (*types) {
|
||||
case 'b': {
|
||||
printf("%s", *((Bool *)ptr) ? "true" : "false");
|
||||
sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
|
||||
APPEND(tmpBuf);
|
||||
break;
|
||||
}
|
||||
case 'B': {
|
||||
putchar('[');
|
||||
*bufp++ = '[';
|
||||
if (bufp == &printString[PRINT_BUF_SIZE])
|
||||
break;
|
||||
for (int i = 0; i < width; ++i) {
|
||||
if (mask & (1<<i))
|
||||
printf("%s", ((Bool *)ptr)[i] ? "true" : "false");
|
||||
if (mask & (1ull << i)) {
|
||||
sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
|
||||
APPEND(tmpBuf);
|
||||
}
|
||||
else
|
||||
printf("_________");
|
||||
putchar(i != width-1 ? ',' : ']');
|
||||
APPEND("_________");
|
||||
*bufp++ = (i != width-1) ? ',' : ']';
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -136,20 +170,24 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
||||
case 'p': PRINT_SCALAR("%p", void *);
|
||||
case 'P': PRINT_VECTOR("%p", void *);
|
||||
default:
|
||||
printf("UNKNOWN TYPE ");
|
||||
putchar(*types);
|
||||
APPEND("UNKNOWN TYPE ");
|
||||
*bufp++ = *types;
|
||||
}
|
||||
++types;
|
||||
}
|
||||
}
|
||||
++format;
|
||||
}
|
||||
|
||||
done:
|
||||
*bufp = '\0';
|
||||
fputs(printString, stdout);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
|
||||
int __num_cores() {
|
||||
#ifdef _MSC_VER
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
// This is quite a hack. Including all of windows.h to get this definition
|
||||
// pulls in a bunch of stuff that leads to undefined symbols at link time.
|
||||
// So we don't #include <windows.h> but instead have the equivalent declarations
|
||||
@@ -48,23 +48,48 @@ declare void @abort() noreturn
|
||||
;; corresponding to one of the Target::ISA enumerant values that gives the
|
||||
;; most capable ISA that the current system can run.
|
||||
;;
|
||||
;; #ifdef _MSC_VER
|
||||
;; extern void __stdcall __cpuid(int info[4], int infoType);
|
||||
;; #else
|
||||
;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum
|
||||
;; backwards compatibility for anyone building ispc with LLVM 3.0
|
||||
;;
|
||||
;; #include <stdint.h>
|
||||
;; #include <stdlib.h>
|
||||
;;
|
||||
;; static void __cpuid(int info[4], int infoType) {
|
||||
;; __asm__ __volatile__ ("cpuid"
|
||||
;; : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
;; : "0" (infoType));
|
||||
;; }
|
||||
;; #endif
|
||||
;;
|
||||
;; /* Save %ebx in case it's the PIC register */
|
||||
;; static void __cpuid_count(int info[4], int level, int count) {
|
||||
;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
|
||||
;; "cpuid\n\t"
|
||||
;; "xchg{l}\t{%%}ebx, %1\n\t"
|
||||
;; : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
;; : "0" (level), "2" (count));
|
||||
;; }
|
||||
;;
|
||||
;; int32_t __get_system_isa() {
|
||||
;; int info[4];
|
||||
;; __cpuid(info, 1);
|
||||
;;
|
||||
;; /* NOTE: the values returned below must be the same as the
|
||||
;; corresponding enumerant values in Target::ISA. */
|
||||
;; if ((info[2] & (1 << 28)) != 0)
|
||||
;; return 2; // AVX
|
||||
;; if ((info[2] & (1 << 28)) != 0) {
|
||||
;; if ((info[2] & (1 << 29)) != 0 && // F16C
|
||||
;; (info[2] & (1 << 30)) != 0) { // RDRAND
|
||||
;; // So far, so good. AVX2?
|
||||
;; // Call cpuid with eax=7, ecx=0
|
||||
;; int info2[4];
|
||||
;; __cpuid_count(info2, 7, 0);
|
||||
;; if ((info2[1] & (1 << 5)) != 0)
|
||||
;; return 4;
|
||||
;; else
|
||||
;; return 3;
|
||||
;; }
|
||||
;; // Regular AVX
|
||||
;; return 2;
|
||||
;; }
|
||||
;; else if ((info[2] & (1 << 19)) != 0)
|
||||
;; return 1; // SSE4
|
||||
;; else if ((info[3] & (1 << 26)) != 0)
|
||||
@@ -73,36 +98,48 @@ declare void @abort() noreturn
|
||||
;; abort();
|
||||
;; }
|
||||
|
||||
%0 = type { i32, i32, i32, i32 }
|
||||
define i32 @__get_system_isa() nounwind uwtable ssp {
|
||||
entry:
|
||||
%0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
|
||||
%asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2
|
||||
%asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
|
||||
%and = and i32 %asmresult5.i, 268435456
|
||||
%cmp = icmp eq i32 %and, 0
|
||||
br i1 %cmp, label %if.else13, label %if.then
|
||||
|
||||
define i32 @__get_system_isa() nounwind ssp {
|
||||
%1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
|
||||
%2 = extractvalue %0 %1, 2
|
||||
%3 = extractvalue %0 %1, 3
|
||||
%4 = and i32 %2, 268435456
|
||||
%5 = icmp eq i32 %4, 0
|
||||
br i1 %5, label %6, label %13
|
||||
if.then: ; preds = %entry
|
||||
%1 = and i32 %asmresult5.i, 1610612736
|
||||
%2 = icmp eq i32 %1, 1610612736
|
||||
br i1 %2, label %if.then7, label %return
|
||||
|
||||
; <label>:6 ; preds = %0
|
||||
%7 = and i32 %2, 524288
|
||||
%8 = icmp eq i32 %7, 0
|
||||
br i1 %8, label %9, label %13
|
||||
if.then7: ; preds = %if.then
|
||||
%3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
|
||||
%asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1
|
||||
%and10 = lshr i32 %asmresult4.i28, 5
|
||||
%4 = and i32 %and10, 1
|
||||
%5 = add i32 %4, 3
|
||||
br label %return
|
||||
|
||||
; <label>:9 ; preds = %6
|
||||
%10 = and i32 %3, 67108864
|
||||
%11 = icmp eq i32 %10, 0
|
||||
br i1 %11, label %12, label %13
|
||||
if.else13: ; preds = %entry
|
||||
%and15 = and i32 %asmresult5.i, 524288
|
||||
%cmp16 = icmp eq i32 %and15, 0
|
||||
br i1 %cmp16, label %if.else18, label %return
|
||||
|
||||
; <label>:12 ; preds = %9
|
||||
if.else18: ; preds = %if.else13
|
||||
%and20 = and i32 %asmresult6.i, 67108864
|
||||
%cmp21 = icmp eq i32 %and20, 0
|
||||
br i1 %cmp21, label %if.else23, label %return
|
||||
|
||||
if.else23: ; preds = %if.else18
|
||||
tail call void @abort() noreturn nounwind
|
||||
unreachable
|
||||
|
||||
; <label>:13 ; preds = %9, %6, %0
|
||||
%.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]
|
||||
ret i32 %.0
|
||||
return: ; preds = %if.else18, %if.else13, %if.then7, %if.then
|
||||
%retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ]
|
||||
ret i32 %retval.0
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; This function is called by each of the dispatch functions we generate;
|
||||
;; it sets @__system_best_isa if it is unset.
|
||||
|
||||
@@ -32,6 +32,11 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; AVX target implementation.
|
||||
|
||||
ctlztz()
|
||||
define_prefetches()
|
||||
define_shuffles()
|
||||
aossoa()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
@@ -249,10 +254,10 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -32,12 +32,16 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Basic 16-wide definitions
|
||||
|
||||
stdlib_core(16)
|
||||
packed_load_and_store(16)
|
||||
scans(16)
|
||||
int64minmax(16)
|
||||
define(`WIDTH',`16')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
include(`builtins-avx-common.ll')
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
|
||||
include(`target-avx-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
@@ -154,51 +158,24 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
|
||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define <16 x float> @__max_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
|
||||
define <16 x float> @__min_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
@@ -209,9 +186,57 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
|
||||
%v1shift = shl i32 %v1, 8
|
||||
%v = or i32 %v1shift, %v0
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
;; __any (16-wide): returns true if the combined movmsk value for the 16-lane
;; mask is non-zero.
;; %0 is the <16 x i32> mask. It is reinterpreted as 16 floats, split into
;; lanes 0-7 and lanes 8-15, and each half is passed to the 8-wide AVX
;; movmsk intrinsic. The two i32 results are merged (upper half shifted
;; left by 8) and compared against zero.
define i1 @__any(<16 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
||||
;; lanes 0-7 of the mask
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
|
||||
;; lanes 8-15 of the mask
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
|
||||
|
||||
;; place the upper half's result above the lower half's 8 bits
%v1shift = shl i32 %v1, 8
|
||||
%v = or i32 %v1shift, %v0
|
||||
;; any bit set => result is true
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;; __all (16-wide): returns true if the combined movmsk value for the 16-lane
;; mask equals 65535 (0xFFFF, i.e. all 16 low bits set).
;; %0 is the <16 x i32> mask. It is reinterpreted as 16 floats, split into
;; lanes 0-7 and lanes 8-15, and each half is passed to the 8-wide AVX
;; movmsk intrinsic. The two i32 results are merged (upper half shifted
;; left by 8) before the comparison.
define i1 @__all(<16 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
||||
;; lanes 0-7 of the mask
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
|
||||
;; lanes 8-15 of the mask
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
|
||||
|
||||
;; place the upper half's result above the lower half's 8 bits
%v1shift = shl i32 %v1, 8
|
||||
%v = or i32 %v1shift, %v0
|
||||
;; 65535 == 0xFFFF: one bit per lane, all 16 set
%cmp = icmp eq i32 %v, 65535
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;; __none (16-wide): returns true if the combined movmsk value for the
;; 16-lane mask is zero; this is the logical negation of __any's test.
;; %0 is the <16 x i32> mask. It is reinterpreted as 16 floats, split into
;; lanes 0-7 and lanes 8-15, and each half is passed to the 8-wide AVX
;; movmsk intrinsic. The two i32 results are merged (upper half shifted
;; left by 8) before the comparison.
define i1 @__none(<16 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
||||
;; lanes 0-7 of the mask
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
|
||||
;; lanes 8-15 of the mask
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
|
||||
|
||||
;; place the upper half's result above the lower half's 8 bits
%v1shift = shl i32 %v1, 8
|
||||
%v = or i32 %v1shift, %v0
|
||||
;; no bit set => result is true
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal float ops
|
||||
|
||||
@@ -247,7 +272,7 @@ reduce_equal(16)
|
||||
;; horizontal int32 ops
|
||||
|
||||
define <16 x i32> @__add_varying_int32(<16 x i32>,
|
||||
<16 x i32>) nounwind readnone alwaysinline {
|
||||
<16 x i32>) nounwind readnone alwaysinline {
|
||||
%s = add <16 x i32> %0, %1
|
||||
ret <16 x i32> %s
|
||||
}
|
||||
@@ -275,11 +300,6 @@ define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint32 ops
|
||||
|
||||
define i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<16 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
@@ -357,11 +377,6 @@ define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint64 ops
|
||||
|
||||
define i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
|
||||
%r = call i64 @__reduce_add_int64(<16 x i64> %v)
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
@@ -375,19 +390,14 @@ define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(16, i8, 8)
|
||||
load_and_broadcast(16, i16, 16)
|
||||
load_and_broadcast(16, i32, 32)
|
||||
load_and_broadcast(16, i64, 64)
|
||||
|
||||
; no masked load instruction for i8 and i16 types??
|
||||
load_masked(16, i8, 8, 1)
|
||||
load_masked(16, i16, 16, 2)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
|
||||
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
|
||||
|
||||
define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
define <16 x i32> @__masked_load_i32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %mask to <16 x float>
|
||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
@@ -405,7 +415,7 @@ define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin
|
||||
}
|
||||
|
||||
|
||||
define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
; double up masks, bitcast to doubles
|
||||
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
@@ -439,6 +449,7 @@ define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
|
||||
ret <16 x i64> %val
|
||||
}
|
||||
|
||||
masked_load_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
@@ -446,15 +457,15 @@ define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
|
||||
; FIXME: there is no AVX instruction for these, but we could be clever
|
||||
; by packing the bits down and setting the last 3/4 or half, respectively,
|
||||
; of the mask to zero... Not sure if this would be a win in the end
|
||||
gen_masked_store(16, i8, 8)
|
||||
gen_masked_store(16, i16, 16)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
|
||||
; note that mask is the 2nd parameter, not the 3rd one!!
|
||||
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
|
||||
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
|
||||
|
||||
define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
%ptr = bitcast <16 x i32> * %0 to i8 *
|
||||
%val = bitcast <16 x i32> %1 to <16 x float>
|
||||
%mask = bitcast <16 x i32> %2 to <16 x float>
|
||||
@@ -476,8 +487,8 @@ define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
|
||||
<16 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
|
||||
<16 x i32> %mask) nounwind alwaysinline {
|
||||
%ptr = bitcast <16 x i64> * %0 to i8 *
|
||||
%val = bitcast <16 x i64> %1 to <16 x double>
|
||||
|
||||
@@ -515,14 +526,15 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
|
||||
ret void
|
||||
}
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
masked_store_blend_8_16_by_16()
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
|
||||
<8 x float>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
%maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
|
||||
%oldValue = load <16 x i32>* %0, align 4
|
||||
%oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
|
||||
@@ -559,8 +571,8 @@ define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
|
||||
<4 x double>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
|
||||
<16 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
|
||||
<16 x i32> %mask) nounwind alwaysinline {
|
||||
%oldValue = load <16 x i64>* %ptr, align 8
|
||||
%old = bitcast <16 x i64> %oldValue to <16 x double>
|
||||
%old0d = shufflevector <16 x double> %old, <16 x double> undef,
|
||||
@@ -618,17 +630,14 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
;; scatter
|
||||
|
||||
gen_gather(16, i8)
|
||||
gen_gather(16, i16)
|
||||
gen_gather(16, i32)
|
||||
gen_gather(16, i64)
|
||||
|
||||
gen_scatter(16, i8)
|
||||
gen_scatter(16, i16)
|
||||
gen_scatter(16, i32)
|
||||
gen_scatter(16, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -32,12 +32,16 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Basic 8-wide definitions
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
define(`WIDTH',`8')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
include(`builtins-avx-common.ll')
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
|
||||
include(`target-avx-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
@@ -154,54 +158,49 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
|
||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__max_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define <8 x float> @__min_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
;; __any (8-wide): returns true if the movmsk value for the 8-lane mask is
;; non-zero. %0 is the <8 x i32> mask, reinterpreted as 8 floats so it can be
;; passed directly to the 8-wide AVX movmsk intrinsic.
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
|
||||
;; any bit set => result is true
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;; __all (8-wide): returns true if the movmsk value for the 8-lane mask
;; equals 255 (0xFF, i.e. all 8 low bits set). %0 is the <8 x i32> mask,
;; reinterpreted as 8 floats so it can be passed directly to the 8-wide AVX
;; movmsk intrinsic.
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
|
||||
;; 255 == 0xFF: one bit per lane, all 8 set
%cmp = icmp eq i32 %v, 255
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;; __none (8-wide): returns true if the movmsk value for the 8-lane mask is
;; zero; this is the logical negation of __any's test. %0 is the <8 x i32>
;; mask, reinterpreted as 8 floats so it can be passed directly to the 8-wide
;; AVX movmsk intrinsic.
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
|
||||
;; no bit set => result is true
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@@ -234,7 +233,7 @@ reduce_equal(8)
|
||||
;; horizontal int32 ops
|
||||
|
||||
define <8 x i32> @__add_varying_int32(<8 x i32>,
|
||||
<8 x i32>) nounwind readnone alwaysinline {
|
||||
<8 x i32>) nounwind readnone alwaysinline {
|
||||
%s = add <8 x i32> %0, %1
|
||||
ret <8 x i32> %s
|
||||
}
|
||||
@@ -262,11 +261,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint32 ops
|
||||
|
||||
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
@@ -310,7 +304,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline
|
||||
;; horizontal int64 ops
|
||||
|
||||
define <8 x i64> @__add_varying_int64(<8 x i64>,
|
||||
<8 x i64>) nounwind readnone alwaysinline {
|
||||
<8 x i64>) nounwind readnone alwaysinline {
|
||||
%s = add <8 x i64> %0, %1
|
||||
ret <8 x i64> %s
|
||||
}
|
||||
@@ -338,11 +332,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint64 ops
|
||||
|
||||
define i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
|
||||
%r = call i64 @__reduce_add_int64(<8 x i64> %v)
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
@@ -356,19 +345,15 @@ define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(8, i8, 8)
|
||||
load_and_broadcast(8, i16, 16)
|
||||
load_and_broadcast(8, i32, 32)
|
||||
load_and_broadcast(8, i64, 64)
|
||||
|
||||
; no masked load instruction for i8 and i16 types??
|
||||
load_masked(8, i8, 8, 1)
|
||||
load_masked(8, i16, 16, 2)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
|
||||
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
|
||||
|
||||
define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
define <8 x i32> @__masked_load_i32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %mask to <8 x float>
|
||||
%floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
|
||||
%retval = bitcast <8 x float> %floatval to <8 x i32>
|
||||
@@ -376,7 +361,7 @@ define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline
|
||||
}
|
||||
|
||||
|
||||
define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
; double up masks, bitcast to doubles
|
||||
%mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
@@ -395,22 +380,20 @@ define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
|
||||
ret <8 x i64> %val
|
||||
}
|
||||
|
||||
masked_load_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
; FIXME: there is no AVX instruction for these, but we could be clever
|
||||
; by packing the bits down and setting the last 3/4 or half, respectively,
|
||||
; of the mask to zero... Not sure if this would be a win in the end
|
||||
gen_masked_store(8, i8, 8)
|
||||
gen_masked_store(8, i16, 16)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
|
||||
; note that mask is the 2nd parameter, not the 3rd one!!
|
||||
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
|
||||
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
|
||||
|
||||
define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
define void @__masked_store_i32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
%ptr = bitcast <8 x i32> * %0 to i8 *
|
||||
%val = bitcast <8 x i32> %1 to <8 x float>
|
||||
%mask = bitcast <8 x i32> %2 to <8 x float>
|
||||
@@ -418,8 +401,8 @@ define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
%ptr = bitcast <8 x i64> * %0 to i8 *
|
||||
%val = bitcast <8 x i64> %1 to <8 x double>
|
||||
|
||||
@@ -443,14 +426,13 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
||||
}
|
||||
|
||||
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
|
||||
<8 x float>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
|
||||
%oldValue = load <8 x i32>* %0, align 4
|
||||
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
|
||||
@@ -464,8 +446,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
}
|
||||
|
||||
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %i32mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %i32mask) nounwind alwaysinline {
|
||||
%oldValue = load <8 x i64>* %ptr, align 8
|
||||
%mask = bitcast <8 x i32> %i32mask to <8 x float>
|
||||
|
||||
@@ -514,19 +496,17 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
ret void
|
||||
}
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
;; scatter
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
|
||||
gen_scatter(8, i8)
|
||||
gen_scatter(8, i16)
|
||||
gen_scatter(8, i32)
|
||||
gen_scatter(8, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
81
builtins/target-avx1-x2.ll
Normal file
81
builtins/target-avx1-x2.ll
Normal file
@@ -0,0 +1,81 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx-x2.ll')
|
||||
|
||||
rdrand_decls()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
;; Lane-wise signed minimum of two <16 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pminsd plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;; Lane-wise signed maximum of two <16 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pmaxsd plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
;; Lane-wise unsigned minimum of two <16 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pminud plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;; Lane-wise unsigned maximum of two <16 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pmaxud plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
ifelse(NO_HALF_DECLARES, `1', `', `
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
81
builtins/target-avx1.ll
Normal file
81
builtins/target-avx1.ll
Normal file
@@ -0,0 +1,81 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx.ll')
|
||||
|
||||
rdrand_decls()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
;; Lane-wise signed minimum of two <8 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pminsd plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
;; Lane-wise signed maximum of two <8 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pmaxsd plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
;; Lane-wise unsigned minimum of two <8 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pminud plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
;; Lane-wise unsigned maximum of two <8 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pmaxud plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
ifelse(NO_HALF_DECLARES, `1', `', `
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
132
builtins/target-avx11-x2.ll
Normal file
132
builtins/target-avx11-x2.ll
Normal file
@@ -0,0 +1,132 @@
|
||||
;; Copyright (c) 2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx-x2.ll')
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
|
||||
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
|
||||
`rdrand_definition()')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
;; Lane-wise signed minimum of two <16 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pminsd plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;; Lane-wise signed maximum of two <16 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pmaxsd plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
;; Lane-wise unsigned minimum of two <16 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pminud plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;; Lane-wise unsigned maximum of two <16 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pmaxud plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
gen_gather(i32)
|
||||
gen_gather(float)
|
||||
gen_gather(i64)
|
||||
gen_gather(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
;; nothing to define...
|
||||
', `
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
|
||||
; 0 is round nearest even
|
||||
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
;; Converts 16 half-precision values, carried as i16 bit patterns, to float.
;; The input is split into its low and high 8 lanes, each half goes through
;; the 8-wide @llvm.x86.vcvtph2ps.256 intrinsic, and the two 8-wide results
;; are concatenated back into one 16-wide vector.
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
|
||||
%r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
|
||||
%r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
|
||||
%r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
ret <16 x float> %r
|
||||
}
|
||||
|
||||
;; Converts 16 floats to half precision, returned as i16 bit patterns.
;; Each 8-lane half of the input is converted with @llvm.x86.vcvtps2ph.256
;; using immediate 0, which the comment on the intrinsic declaration in this
;; file describes as round nearest even; the two halves are then
;; concatenated into the 16-wide result.
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
|
||||
%r_0 = shufflevector <16 x float> %v, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
|
||||
%r_1 = shufflevector <16 x float> %v, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
|
||||
%r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
ret <16 x i16> %r
|
||||
}
|
||||
|
||||
;; Converts a single half-precision value, given as an i16, to float.
;; The scalar is placed in lane 0 of an 8-wide vector whose other lanes are
;; undef, run through @llvm.x86.vcvtph2ps.256, and lane 0 of the result is
;; returned.
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
|
||||
%v1 = bitcast i16 %v to <1 x i16>
|
||||
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
|
||||
%r = extractelement <8 x float> %rv, i32 0
|
||||
ret float %r
|
||||
}
|
||||
|
||||
;; Converts a single float to half precision, returned as an i16.
;; The scalar is placed in lane 0 of an 8-wide vector whose other lanes are
;; undef, run through @llvm.x86.vcvtps2ph.256 with immediate 0, and lane 0
;; of the result is returned.
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
|
||||
%v1 = bitcast float %v to <1 x float>
|
||||
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; round to nearest even
|
||||
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
|
||||
%r = extractelement <8 x i16> %rv, i32 0
|
||||
ret i16 %r
|
||||
}
|
||||
'
|
||||
)
|
||||
115
builtins/target-avx11.ll
Normal file
115
builtins/target-avx11.ll
Normal file
@@ -0,0 +1,115 @@
|
||||
;; Copyright (c) 2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx.ll')
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
|
||||
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
|
||||
`rdrand_definition()')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
;; Lane-wise signed minimum of two <8 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pminsd plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
;; Lane-wise signed maximum of two <8 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pmaxsd plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
;; Lane-wise unsigned minimum of two <8 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pminud plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
;; Lane-wise unsigned maximum of two <8 x i32> vectors.
;; The helper macro called in the body is handed the 4-wide SSE4.1 intrinsic
;; @llvm.x86.sse41.pmaxud plus the two unnamed arguments, and binds its
;; result to %ret. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 4-lane slice - confirm.
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
gen_gather(i32)
|
||||
gen_gather(float)
|
||||
gen_gather(i64)
|
||||
gen_gather(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
;; nothing to define...
|
||||
', `
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
|
||||
; 0 is round nearest even
|
||||
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
;; Converts 8 half-precision values, carried as i16 bit patterns, to float
;; with a single call to the 8-wide @llvm.x86.vcvtph2ps.256 intrinsic.
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
|
||||
%r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
|
||||
ret <8 x float> %r
|
||||
}
|
||||
|
||||
;; Converts 8 floats to half precision, returned as i16 bit patterns, with a
;; single call to @llvm.x86.vcvtps2ph.256. The immediate 0 is the rounding
;; selector that the comment on the intrinsic declaration in this file
;; describes as round nearest even.
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
|
||||
%r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
|
||||
ret <8 x i16> %r
|
||||
}
|
||||
|
||||
;; Converts a single half-precision value, given as an i16, to float.
;; The scalar is placed in lane 0 of an 8-wide vector whose other lanes are
;; undef, run through @llvm.x86.vcvtph2ps.256, and lane 0 of the result is
;; returned.
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
|
||||
%v1 = bitcast i16 %v to <1 x i16>
|
||||
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
|
||||
%r = extractelement <8 x float> %rv, i32 0
|
||||
ret float %r
|
||||
}
|
||||
|
||||
;; Converts a single float to half precision, returned as an i16.
;; The scalar is placed in lane 0 of an 8-wide vector whose other lanes are
;; undef, run through @llvm.x86.vcvtps2ph.256 with immediate 0, and lane 0
;; of the result is returned.
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
|
||||
%v1 = bitcast float %v to <1 x float>
|
||||
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; round to nearest even
|
||||
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
|
||||
%r = extractelement <8 x i16> %rv, i32 0
|
||||
ret i16 %r
|
||||
}
|
||||
')
|
||||
561
builtins/target-avx2-x2.ll
Normal file
561
builtins/target-avx2-x2.ll
Normal file
@@ -0,0 +1,561 @@
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `',
|
||||
LLVM_VERSION, `LLVM_3_1', `',
|
||||
`define(`HAVE_GATHER', `1')')
|
||||
|
||||
include(`target-avx-x2.ll')
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
|
||||
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
|
||||
`rdrand_definition()')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
|
||||
;; Lane-wise signed minimum of two <16 x i32> vectors.
;; The helper macro called in the body is handed the 8-wide AVX2 intrinsic
;; @llvm.x86.avx2.pmins.d plus the two unnamed arguments, and binds its
;; result to %m. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 8-lane half - confirm.
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
|
||||
ret <16 x i32> %m
|
||||
}
|
||||
|
||||
;; Lane-wise signed maximum of two <16 x i32> vectors.
;; The helper macro called in the body is handed the 8-wide AVX2 intrinsic
;; @llvm.x86.avx2.pmaxs.d plus the two unnamed arguments, and binds its
;; result to %m. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 8-lane half - confirm.
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
|
||||
ret <16 x i32> %m
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
|
||||
;; Lane-wise unsigned minimum of two <16 x i32> vectors.
;; The helper macro called in the body is handed the 8-wide AVX2 intrinsic
;; @llvm.x86.avx2.pminu.d plus the two unnamed arguments, and binds its
;; result to %m. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 8-lane half - confirm.
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
|
||||
ret <16 x i32> %m
|
||||
}
|
||||
|
||||
;; Lane-wise unsigned maximum of two <16 x i32> vectors.
;; The helper macro called in the body is handed the 8-wide AVX2 intrinsic
;; @llvm.x86.avx2.pmaxu.d plus the two unnamed arguments, and binds its
;; result to %m. NOTE(review): the macro body is not visible here;
;; presumably it runs the intrinsic on each 8-lane half - confirm.
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
|
||||
ret <16 x i32> %m
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
;; nothing to define...
|
||||
', `
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
|
||||
; 0 is round nearest even
|
||||
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
;; Converts 16 half-precision values, carried as i16 bit patterns, to float.
;; The input is split into its low and high 8 lanes, each half goes through
;; the 8-wide @llvm.x86.vcvtph2ps.256 intrinsic, and the two 8-wide results
;; are concatenated back into one 16-wide vector.
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
|
||||
%r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
|
||||
%r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
|
||||
%r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
ret <16 x float> %r
|
||||
}
|
||||
|
||||
;; Converts 16 floats to half precision, returned as i16 bit patterns.
;; Each 8-lane half of the input is converted with @llvm.x86.vcvtps2ph.256
;; using immediate 0, which the comment on the intrinsic declaration in this
;; file describes as round nearest even; the two halves are then
;; concatenated into the 16-wide result.
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
|
||||
%r_0 = shufflevector <16 x float> %v, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
|
||||
%r_1 = shufflevector <16 x float> %v, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
|
||||
%r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
ret <16 x i16> %r
|
||||
}
|
||||
|
||||
;; Converts a single half-precision value, given as an i16, to float.
;; The scalar is placed in lane 0 of an 8-wide vector whose other lanes are
;; undef, run through @llvm.x86.vcvtph2ps.256, and lane 0 of the result is
;; returned.
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
|
||||
%v1 = bitcast i16 %v to <1 x i16>
|
||||
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
|
||||
%r = extractelement <8 x float> %rv, i32 0
|
||||
ret float %r
|
||||
}
|
||||
|
||||
;; Converts a single float to half precision, returned as an i16.
;; The scalar is placed in lane 0 of an 8-wide vector whose other lanes are
;; undef, run through @llvm.x86.vcvtps2ph.256 with immediate 0, and lane 0
;; of the result is returned.
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
|
||||
%v1 = bitcast float %v to <1 x float>
|
||||
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; round to nearest even
|
||||
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
|
||||
%r = extractelement <8 x i16> %rv, i32 0
|
||||
ret i16 %r
|
||||
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
declare void @llvm.trap() noreturn nounwind
|
||||
|
||||
; Splits the 16-wide vector %$2, whose elements have type $1, into four
; 4-lane vectors named %$2_1 through %$2_4. They hold lanes 0-3, 4-7, 8-11
; and 12-15 of the input respectively.
; $1: type
|
||||
; $2: var base name
|
||||
define(`extract_4s', `
|
||||
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||
%$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||
')
|
||||
|
||||
; Splits the 16-wide vector %$2, whose elements have type $1, into two
; 8-lane vectors: %$2_1 holds lanes 0-7 and %$2_2 holds lanes 8-15.
; $1: type
|
||||
; $2: var base name
|
||||
define(`extract_8s', `
|
||||
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
')
|
||||
|
||||
; Concatenates the two 8-wide vectors %$3 and %$4, with element type $1,
; into one 16-wide vector named %$2. %$3 supplies lanes 0-7 and %$4
; supplies lanes 8-15.
; $1: element type
|
||||
; $2: ret name
|
||||
; $3: v1
|
||||
; $4: v2
|
||||
define(`assemble_8s', `
|
||||
%$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
')
|
||||
|
||||
; Concatenates four 4-wide vectors %$3, %$4, %$5 and %$6, with element
; type $1, into one 16-wide vector named %$2. The first pair is joined into
; the 8-wide temporary %$2_1 and the second pair into %$2_2; those two are
; then joined by the two-vector assembling macro defined earlier in this
; file.
; $1: element type
|
||||
; $2: ret name
|
||||
; $3: v1
|
||||
; $4: v2
|
||||
; $5: v3
|
||||
; $6: v4
|
||||
define(`assemble_4s', `
|
||||
%$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
assemble_8s($1, $2, $2_1, $2_2)
|
||||
')
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)',
|
||||
LLVM_VERSION, `LLVM_3_1', `
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)', `
|
||||
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int32 gathers
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
|
||||
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
|
||||
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
|
||||
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
|
||||
|
||||
;; 16-wide gather of i32 values from a base pointer plus 32-bit offsets.
;; The i32 scale is truncated to the i8 form the AVX2 intrinsic takes, the
;; offsets and the mask are each split into two 8-lane halves, each half is
;; gathered with @llvm.x86.avx2.gather.d.d.256 using an undef pass-through
;; vector, and the two 8-wide results are concatenated into %v.
;; NOTE(review): the scale reaches the intrinsic as a runtime value here;
;; presumably callers pass a constant that survives inlining - confirm.
define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
extract_8s(i32, offsets)
|
||||
extract_8s(i32, vecmask)
|
||||
|
||||
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
|
||||
<8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
|
||||
<8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale8)
|
||||
|
||||
assemble_8s(i32, v, v1, v2)
|
||||
|
||||
ret <16 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
;; 16-wide gather of i32 values from a base pointer plus 64-bit offsets.
;; The i32 scale is truncated to i8, the mask and the offsets are each split
;; into four 4-lane pieces, each piece is gathered with
;; @llvm.x86.avx2.gather.q.d.256 using an undef pass-through vector, and the
;; four 4-wide results are concatenated into %v.
define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
|
||||
i32 %scale, <16 x i64> %offsets,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
|
||||
extract_4s(i32, vecmask)
|
||||
extract_4s(i64, offsets)
|
||||
|
||||
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
|
||||
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale8)
|
||||
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale8)
|
||||
|
||||
assemble_4s(i32, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
;; 16-wide gather of i32 values through 16 pointers held as 32-bit integers.
;; The pointer values are passed to @llvm.x86.avx2.gather.d.d.256 as the
;; offsets from a null base with a scale of 1. The pointers and the mask are
;; each split into two 8-lane halves, each half is gathered with an undef
;; pass-through vector, and the two results are concatenated into %v.
define <16 x i32> @__gather32_i32(<16 x i32> %ptrs,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
extract_8s(i32, ptrs)
|
||||
extract_8s(i32, vecmask)
|
||||
|
||||
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
|
||||
<8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1)
|
||||
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
|
||||
<8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1)
|
||||
|
||||
assemble_8s(i32, v, v1, v2)
|
||||
|
||||
ret <16 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
;; 16-wide gather of i32 values through 16 pointers held as 64-bit integers.
;; The pointer values are passed to @llvm.x86.avx2.gather.q.d.256 as the
;; offsets from a null base with a scale of 1. The pointers and the mask are
;; each split into four 4-lane pieces, each piece is gathered with an undef
;; pass-through vector, and the four results are concatenated into %v.
define <16 x i32> @__gather64_i32(<16 x i64> %ptrs,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
extract_4s(i64, ptrs)
|
||||
extract_4s(i32, vecmask)
|
||||
|
||||
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
|
||||
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
|
||||
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
|
||||
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
|
||||
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
|
||||
<4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1)
|
||||
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
|
||||
<4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1)
|
||||
|
||||
assemble_4s(i32, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float gathers
|
||||
|
||||
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
|
||||
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
|
||||
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
|
||||
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
|
||||
|
||||
;; 16-wide gather of float values from a base pointer plus 32-bit offsets.
;; The i32 scale is truncated to i8 and the integer mask is bitcast to a
;; float vector, which is the mask type @llvm.x86.avx2.gather.d.ps.256 is
;; declared with. Offsets and mask are each split into two 8-lane halves,
;; each half is gathered with an undef pass-through vector, and the two
;; results are concatenated into %v.
define <16 x float> @__gather_base_offsets32_float(i8 * %ptr,
|
||||
i32 %scale, <16 x i32> %offsets,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
%mask = bitcast <16 x i32> %vecmask to <16 x float>
|
||||
extract_8s(i32, offsets)
|
||||
extract_8s(float, mask)
|
||||
|
||||
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
|
||||
<8 x i32> %offsets_1, <8 x float> %mask_1, i8 %scale8)
|
||||
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
|
||||
<8 x i32> %offsets_2, <8 x float> %mask_2, i8 %scale8)
|
||||
|
||||
assemble_8s(float, v, v1, v2)
|
||||
|
||||
ret <16 x float> %v
|
||||
}
|
||||
|
||||
|
||||
;; 16-wide float gather from a base pointer plus 64-bit offsets.
;; The i32 mask is bitcast to float lanes and the i32 scale is truncated to i8.
;; Four 4-lane gathers are issued and their results are combined into the
;; 16-wide return value.
define <16 x float> @__gather_base_offsets64_float(i8 * %ptr, i32 %scale,
                                                   <16 x i64> %offsets,
                                                   <16 x i32> %vecmask)
    nounwind readonly alwaysinline {
  %fmask = bitcast <16 x i32> %vecmask to <16 x float>
  %scale_i8 = trunc i32 %scale to i8
  extract_4s(float, fmask)
  extract_4s(i64, offsets)

  %quad1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
      <4 x float> undef, i8 * %ptr, <4 x i64> %offsets_1, <4 x float> %fmask_1, i8 %scale_i8)
  %quad2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
      <4 x float> undef, i8 * %ptr, <4 x i64> %offsets_2, <4 x float> %fmask_2, i8 %scale_i8)
  %quad3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
      <4 x float> undef, i8 * %ptr, <4 x i64> %offsets_3, <4 x float> %fmask_3, i8 %scale_i8)
  %quad4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
      <4 x float> undef, i8 * %ptr, <4 x i64> %offsets_4, <4 x float> %fmask_4, i8 %scale_i8)

  assemble_4s(float, res, quad1, quad2, quad3, quad4)
  ret <16 x float> %res
}
|
||||
|
||||
|
||||
;; 16-wide float gather addressed by the 32-bit values in ptrs, which are
;; passed as indices against a null base pointer with a scale of 1.
;; Done as two 8-lane gathers; the mask is bitcast from i32 to float lanes.
define <16 x float> @__gather32_float(<16 x i32> %ptrs, <16 x i32> %vecmask)
    nounwind readonly alwaysinline {
  %fmask = bitcast <16 x i32> %vecmask to <16 x float>
  extract_8s(i32, ptrs)
  extract_8s(float, fmask)

  %half1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(
      <8 x float> undef, i8 * null, <8 x i32> %ptrs_1, <8 x float> %fmask_1, i8 1)
  %half2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(
      <8 x float> undef, i8 * null, <8 x i32> %ptrs_2, <8 x float> %fmask_2, i8 1)

  assemble_8s(float, res, half1, half2)
  ret <16 x float> %res
}
|
||||
|
||||
|
||||
;; 16-wide float gather addressed by the 64-bit values in ptrs, which are
;; passed as indices against a null base pointer with a scale of 1.
;; Done as four 4-lane gathers; the mask is bitcast from i32 to float lanes.
define <16 x float> @__gather64_float(<16 x i64> %ptrs, <16 x i32> %vecmask)
    nounwind readonly alwaysinline {
  %fmask = bitcast <16 x i32> %vecmask to <16 x float>
  extract_4s(float, fmask)
  extract_4s(i64, ptrs)

  %quad1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
      <4 x float> undef, i8 * null, <4 x i64> %ptrs_1, <4 x float> %fmask_1, i8 1)
  %quad2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
      <4 x float> undef, i8 * null, <4 x i64> %ptrs_2, <4 x float> %fmask_2, i8 1)
  %quad3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
      <4 x float> undef, i8 * null, <4 x i64> %ptrs_3, <4 x float> %fmask_3, i8 1)
  %quad4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
      <4 x float> undef, i8 * null, <4 x i64> %ptrs_4, <4 x float> %fmask_4, i8 1)

  assemble_4s(float, res, quad1, quad2, quad3, quad4)
  ret <16 x float> %res
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int64 gathers
|
||||
|
||||
;; AVX2 gather intrinsics that return four i64 lanes. The d form takes 32-bit
;; indices and the q form takes 64-bit indices.
;; Operands in order: target, base pointer, indices, mask, scale.
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(
    <4 x i64>, i8 *, <4 x i32>, <4 x i64>, i8) nounwind readonly
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(
    <4 x i64>, i8 *, <4 x i64>, <4 x i64>, i8) nounwind readonly
|
||||
|
||||
;; 16-wide i64 gather from a base pointer plus 32-bit offsets.
;; The i32 mask is sign-extended to i64 lanes and the i32 scale is truncated
;; to i8. Four 4-lane gathers are issued and their results are combined into
;; the 16-wide return value.
define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr, i32 %scale,
                                               <16 x i32> %offsets,
                                               <16 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask64 = sext <16 x i32> %mask32 to <16 x i64>
  %scale_i8 = trunc i32 %scale to i8
  extract_4s(i64, mask64)
  extract_4s(i32, offsets)

  %quad1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
      <4 x i64> undef, i8 * %ptr, <4 x i32> %offsets_1, <4 x i64> %mask64_1, i8 %scale_i8)
  %quad2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
      <4 x i64> undef, i8 * %ptr, <4 x i32> %offsets_2, <4 x i64> %mask64_2, i8 %scale_i8)
  %quad3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
      <4 x i64> undef, i8 * %ptr, <4 x i32> %offsets_3, <4 x i64> %mask64_3, i8 %scale_i8)
  %quad4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
      <4 x i64> undef, i8 * %ptr, <4 x i32> %offsets_4, <4 x i64> %mask64_4, i8 %scale_i8)

  assemble_4s(i64, res, quad1, quad2, quad3, quad4)
  ret <16 x i64> %res
}
|
||||
|
||||
|
||||
;; 16-wide i64 gather from a base pointer plus 64-bit offsets.
;; The i32 mask is sign-extended to i64 lanes and the i32 scale is truncated
;; to i8. Four 4-lane gathers are issued and their results are combined into
;; the 16-wide return value.
define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr, i32 %scale,
                                               <16 x i64> %offsets,
                                               <16 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask64 = sext <16 x i32> %mask32 to <16 x i64>
  %scale_i8 = trunc i32 %scale to i8
  extract_4s(i64, mask64)
  extract_4s(i64, offsets)

  %quad1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
      <4 x i64> undef, i8 * %ptr, <4 x i64> %offsets_1, <4 x i64> %mask64_1, i8 %scale_i8)
  %quad2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
      <4 x i64> undef, i8 * %ptr, <4 x i64> %offsets_2, <4 x i64> %mask64_2, i8 %scale_i8)
  %quad3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
      <4 x i64> undef, i8 * %ptr, <4 x i64> %offsets_3, <4 x i64> %mask64_3, i8 %scale_i8)
  %quad4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
      <4 x i64> undef, i8 * %ptr, <4 x i64> %offsets_4, <4 x i64> %mask64_4, i8 %scale_i8)

  assemble_4s(i64, res, quad1, quad2, quad3, quad4)
  ret <16 x i64> %res
}
|
||||
|
||||
|
||||
;; 16-wide i64 gather addressed by the 32-bit values in ptrs, which are passed
;; as indices against a null base pointer with a scale of 1.
;; The i32 mask is sign-extended to i64 lanes; four 4-lane gathers are issued.
define <16 x i64> @__gather32_i64(<16 x i32> %ptrs, <16 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask64 = sext <16 x i32> %mask32 to <16 x i64>
  extract_4s(i64, mask64)
  extract_4s(i32, ptrs)

  %quad1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
      <4 x i64> undef, i8 * null, <4 x i32> %ptrs_1, <4 x i64> %mask64_1, i8 1)
  %quad2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
      <4 x i64> undef, i8 * null, <4 x i32> %ptrs_2, <4 x i64> %mask64_2, i8 1)
  %quad3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
      <4 x i64> undef, i8 * null, <4 x i32> %ptrs_3, <4 x i64> %mask64_3, i8 1)
  %quad4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
      <4 x i64> undef, i8 * null, <4 x i32> %ptrs_4, <4 x i64> %mask64_4, i8 1)

  assemble_4s(i64, res, quad1, quad2, quad3, quad4)
  ret <16 x i64> %res
}
|
||||
|
||||
;; 16-wide i64 gather addressed by the 64-bit values in ptrs, which are passed
;; as indices against a null base pointer with a scale of 1.
;; The i32 mask is sign-extended to i64 lanes; four 4-lane gathers are issued.
define <16 x i64> @__gather64_i64(<16 x i64> %ptrs, <16 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask64 = sext <16 x i32> %mask32 to <16 x i64>
  extract_4s(i64, mask64)
  extract_4s(i64, ptrs)

  %quad1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
      <4 x i64> undef, i8 * null, <4 x i64> %ptrs_1, <4 x i64> %mask64_1, i8 1)
  %quad2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
      <4 x i64> undef, i8 * null, <4 x i64> %ptrs_2, <4 x i64> %mask64_2, i8 1)
  %quad3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
      <4 x i64> undef, i8 * null, <4 x i64> %ptrs_3, <4 x i64> %mask64_3, i8 1)
  %quad4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
      <4 x i64> undef, i8 * null, <4 x i64> %ptrs_4, <4 x i64> %mask64_4, i8 1)

  assemble_4s(i64, res, quad1, quad2, quad3, quad4)
  ret <16 x i64> %res
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double gathers
|
||||
|
||||
;; AVX2 gather intrinsics that return four double lanes. The q form takes
;; 64-bit indices and the d form takes 32-bit indices.
;; Operands in order: target, base pointer, indices, mask, scale.
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(
    <4 x double>, i8 *, <4 x i64>, <4 x double>, i8) nounwind readonly
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(
    <4 x double>, i8 *, <4 x i32>, <4 x double>, i8) nounwind readonly
|
||||
|
||||
;; 16-wide double gather from a base pointer plus 32-bit offsets.
;; The i32 mask is sign-extended to i64 and then bitcast to double lanes; the
;; i32 scale is truncated to i8. Four 4-lane gathers are issued and their
;; results are combined into the 16-wide return value.
define <16 x double> @__gather_base_offsets32_double(i8 * %ptr, i32 %scale,
                                                     <16 x i32> %offsets,
                                                     <16 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask_i64 = sext <16 x i32> %mask32 to <16 x i64>
  %mask_f64 = bitcast <16 x i64> %mask_i64 to <16 x double>
  %scale_i8 = trunc i32 %scale to i8
  extract_4s(double, mask_f64)
  extract_4s(i32, offsets)

  %quad1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
      <4 x double> undef, i8 * %ptr, <4 x i32> %offsets_1, <4 x double> %mask_f64_1, i8 %scale_i8)
  %quad2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
      <4 x double> undef, i8 * %ptr, <4 x i32> %offsets_2, <4 x double> %mask_f64_2, i8 %scale_i8)
  %quad3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
      <4 x double> undef, i8 * %ptr, <4 x i32> %offsets_3, <4 x double> %mask_f64_3, i8 %scale_i8)
  %quad4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
      <4 x double> undef, i8 * %ptr, <4 x i32> %offsets_4, <4 x double> %mask_f64_4, i8 %scale_i8)

  assemble_4s(double, res, quad1, quad2, quad3, quad4)
  ret <16 x double> %res
}
|
||||
|
||||
|
||||
;; 16-wide double gather from a base pointer plus 64-bit offsets.
;; The i32 mask is sign-extended to i64 and then bitcast to double lanes; the
;; i32 scale is truncated to i8. Four 4-lane gathers are issued and their
;; results are combined into the 16-wide return value.
define <16 x double> @__gather_base_offsets64_double(i8 * %ptr, i32 %scale,
                                                     <16 x i64> %offsets,
                                                     <16 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask_i64 = sext <16 x i32> %mask32 to <16 x i64>
  %mask_f64 = bitcast <16 x i64> %mask_i64 to <16 x double>
  %scale_i8 = trunc i32 %scale to i8
  extract_4s(double, mask_f64)
  extract_4s(i64, offsets)

  %quad1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
      <4 x double> undef, i8 * %ptr, <4 x i64> %offsets_1, <4 x double> %mask_f64_1, i8 %scale_i8)
  %quad2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
      <4 x double> undef, i8 * %ptr, <4 x i64> %offsets_2, <4 x double> %mask_f64_2, i8 %scale_i8)
  %quad3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
      <4 x double> undef, i8 * %ptr, <4 x i64> %offsets_3, <4 x double> %mask_f64_3, i8 %scale_i8)
  %quad4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
      <4 x double> undef, i8 * %ptr, <4 x i64> %offsets_4, <4 x double> %mask_f64_4, i8 %scale_i8)

  assemble_4s(double, res, quad1, quad2, quad3, quad4)
  ret <16 x double> %res
}
|
||||
|
||||
|
||||
;; 16-wide double gather addressed by the 32-bit values in ptrs, which are
;; passed as indices against a null base pointer with a scale of 1.
;; The i32 mask is sign-extended to i64 and bitcast to double lanes; four
;; 4-lane gathers are issued.
define <16 x double> @__gather32_double(<16 x i32> %ptrs, <16 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask_i64 = sext <16 x i32> %mask32 to <16 x i64>
  %mask_f64 = bitcast <16 x i64> %mask_i64 to <16 x double>
  extract_4s(double, mask_f64)
  extract_4s(i32, ptrs)

  %quad1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
      <4 x double> undef, i8 * null, <4 x i32> %ptrs_1, <4 x double> %mask_f64_1, i8 1)
  %quad2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
      <4 x double> undef, i8 * null, <4 x i32> %ptrs_2, <4 x double> %mask_f64_2, i8 1)
  %quad3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
      <4 x double> undef, i8 * null, <4 x i32> %ptrs_3, <4 x double> %mask_f64_3, i8 1)
  %quad4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
      <4 x double> undef, i8 * null, <4 x i32> %ptrs_4, <4 x double> %mask_f64_4, i8 1)

  assemble_4s(double, res, quad1, quad2, quad3, quad4)
  ret <16 x double> %res
}
|
||||
|
||||
|
||||
;; 16-wide double gather addressed by the 64-bit values in ptrs, which are
;; passed as indices against a null base pointer with a scale of 1.
;; The i32 mask is sign-extended to i64 and bitcast to double lanes; four
;; 4-lane gathers are issued.
define <16 x double> @__gather64_double(<16 x i64> %ptrs, <16 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask_i64 = sext <16 x i32> %mask32 to <16 x i64>
  %mask_f64 = bitcast <16 x i64> %mask_i64 to <16 x double>
  extract_4s(double, mask_f64)
  extract_4s(i64, ptrs)

  %quad1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
      <4 x double> undef, i8 * null, <4 x i64> %ptrs_1, <4 x double> %mask_f64_1, i8 1)
  %quad2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
      <4 x double> undef, i8 * null, <4 x i64> %ptrs_2, <4 x double> %mask_f64_2, i8 1)
  %quad3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
      <4 x double> undef, i8 * null, <4 x i64> %ptrs_3, <4 x double> %mask_f64_3, i8 1)
  %quad4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
      <4 x double> undef, i8 * null, <4 x i64> %ptrs_4, <4 x double> %mask_f64_4, i8 1)

  assemble_4s(double, res, quad1, quad2, quad3, quad4)
  ret <16 x double> %res
}
|
||||
|
||||
')
|
||||
433
builtins/target-avx2.ll
Normal file
433
builtins/target-avx2.ll
Normal file
@@ -0,0 +1,433 @@
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;; Set the gather-available flag macro to 1 for every LLVM version other than
;; 3.0 and 3.1. This happens before the AVX target file is pulled in below.
ifelse(LLVM_VERSION,
       `LLVM_3_0', `',
       LLVM_VERSION,
       `LLVM_3_1', `',
       `define(`HAVE_GATHER', `1')')
|
||||
|
||||
include(`target-avx.ll')
|
||||
|
||||
;; LLVM 3.0 and 3.1 only get the rdrand declarations macro; any other version
;; gets the rdrand definition macro.
ifelse(LLVM_VERSION,
       `LLVM_3_0', `rdrand_decls()',
       LLVM_VERSION,
       `LLVM_3_1', `rdrand_decls()',
       `rdrand_definition()')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
;; AVX2 intrinsics for the signed 32-bit minimum and maximum of two 8-lane vectors.
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
|
||||
;; Signed 32-bit minimum of the two 8-lane operands; a thin wrapper around the
;; pmins.d intrinsic.
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>)
    nounwind readonly alwaysinline {
  %lesser = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %lesser
}
|
||||
|
||||
;; Signed 32-bit maximum of the two 8-lane operands; a thin wrapper around the
;; pmaxs.d intrinsic.
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>)
    nounwind readonly alwaysinline {
  %greater = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %greater
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
;; AVX2 intrinsics for the unsigned 32-bit minimum and maximum of two 8-lane vectors.
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
|
||||
;; Unsigned 32-bit minimum of the two 8-lane operands; a thin wrapper around
;; the pminu.d intrinsic.
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>)
    nounwind readonly alwaysinline {
  %lesser = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %lesser
}
|
||||
|
||||
;; Unsigned 32-bit maximum of the two 8-lane operands; a thin wrapper around
;; the pmaxu.d intrinsic.
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>)
    nounwind readonly alwaysinline {
  %greater = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %greater
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define for LLVM 3.0
', `
;; 8-lane half to float and float to half intrinsics. The i32 operand of the
;; float to half form selects rounding; 0 is round to nearest even.
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone

;; Varying half to float: one intrinsic call on all 8 lanes.
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
  %as_float = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
  ret <8 x float> %as_float
}

;; Varying float to half with rounding operand 0 (round to nearest even).
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
  %as_half = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
  ret <8 x i16> %as_half
}

;; Uniform half to float: the scalar is placed in lane 0 of an 8-lane vector
;; whose other lanes are undef, the vector intrinsic is applied, and lane 0 of
;; the result is returned.
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %single = bitcast i16 %v to <1 x i16>
  %wide = shufflevector <1 x i16> %single, <1 x i16> undef,
              <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                         i32 undef, i32 undef, i32 undef, i32 undef>
  %wide_float = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %wide)
  %lane0 = extractelement <8 x float> %wide_float, i32 0
  ret float %lane0
}

;; Uniform float to half: same lane 0 approach as the uniform half to float
;; case, with rounding operand 0 (round to nearest even).
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %single = bitcast float %v to <1 x float>
  %wide = shufflevector <1 x float> %single, <1 x float> undef,
              <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                         i32 undef, i32 undef, i32 undef, i32 undef>
  %wide_half = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %wide, i32 0)
  %lane0 = extractelement <8 x i16> %wide_half, i32 0
  ret i16 %lane0
}
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
;; LLVM trap intrinsic: takes no operands and is declared as not returning.
declare void @llvm.trap() nounwind noreturn
|
||||
|
||||
;; Helper macro for the 8-wide gathers in this file. Its arguments are an
;; element type and the name of an 8-lane vector value. It emits two
;; shufflevector instructions: lanes 0 to 3 are named with a _1 suffix and
;; lanes 4 to 7 with a _2 suffix.
define(`extract_4s', `
  %$2_1 = shufflevector <8 x $1> %$2, <8 x $1> undef,
                        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef,
                        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
')
|
||||
|
||||
;; Gather implementations. When building against LLVM 3.0 or 3.1 every element
;; type goes through the factored generator macro. For any other version only
;; the 8-bit and 16-bit types use a generator macro; the 32-bit and 64-bit
;; types are written out below with AVX2 gather intrinsics.
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)',
|
||||
LLVM_VERSION, `LLVM_3_1', `
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)', `
|
||||
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int32 gathers
|
||||
|
||||
;; AVX2 gather intrinsics that return i32 lanes. The d form takes eight 32-bit
;; indices and the q form takes four 64-bit indices.
;; Operands in order: target, base pointer, indices, mask, scale.
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(
    <8 x i32>, i8 *, <8 x i32>, <8 x i32>, i8) nounwind readonly
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(
    <4 x i32>, i8 *, <4 x i64>, <4 x i32>, i8) nounwind readonly
|
||||
|
||||
;; 8-wide i32 gather from a base pointer plus 32-bit offsets: a single
;; intrinsic call, with the i32 scale truncated to the i8 the intrinsic takes.
define <8 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale,
                                              <8 x i32> %offsets,
                                              <8 x i32> %vecmask)
    nounwind readonly alwaysinline {
  %scale_i8 = trunc i32 %scale to i8
  %res = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(
      <8 x i32> undef, i8 * %ptr, <8 x i32> %offsets, <8 x i32> %vecmask, i8 %scale_i8)
  ret <8 x i32> %res
}
|
||||
|
||||
|
||||
;; 8-wide i32 gather from a base pointer plus 64-bit offsets. The offsets and
;; mask are split into lanes 0 to 3 and lanes 4 to 7, each group is gathered
;; separately, and the two 4-lane results are concatenated.
define <8 x i32> @__gather_base_offsets64_i32(i8 * %ptr, i32 %scale,
                                              <8 x i64> %offsets,
                                              <8 x i32> %vecmask)
    nounwind readonly alwaysinline {
  %scale_i8 = trunc i32 %scale to i8
  extract_4s(i64, offsets)
  extract_4s(i32, vecmask)

  %lo = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(
      <4 x i32> undef, i8 * %ptr, <4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale_i8)
  %hi = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(
      <4 x i32> undef, i8 * %ptr, <4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale_i8)

  %res = shufflevector <4 x i32> %lo, <4 x i32> %hi,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %res
}
|
||||
|
||||
|
||||
;; 8-wide i32 gather addressed by the 32-bit values in ptrs, which are passed
;; as indices against a null base pointer with a scale of 1.
define <8 x i32> @__gather32_i32(<8 x i32> %ptrs, <8 x i32> %vecmask)
    nounwind readonly alwaysinline {
  %res = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(
      <8 x i32> undef, i8 * null, <8 x i32> %ptrs, <8 x i32> %vecmask, i8 1)
  ret <8 x i32> %res
}
|
||||
|
||||
|
||||
;; 8-wide i32 gather addressed by the 64-bit values in ptrs, which are passed
;; as indices against a null base pointer with a scale of 1. Lanes 0 to 3 and
;; lanes 4 to 7 are gathered separately and then concatenated.
define <8 x i32> @__gather64_i32(<8 x i64> %ptrs, <8 x i32> %vecmask)
    nounwind readonly alwaysinline {
  extract_4s(i32, vecmask)
  extract_4s(i64, ptrs)

  %lo = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(
      <4 x i32> undef, i8 * null, <4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
  %hi = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(
      <4 x i32> undef, i8 * null, <4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)

  %res = shufflevector <4 x i32> %lo, <4 x i32> %hi,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %res
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float gathers
|
||||
|
||||
;; AVX2 gather intrinsics that return float lanes. The d form takes eight
;; 32-bit indices and the q form takes four 64-bit indices.
;; Operands in order: target, base pointer, indices, mask, scale.
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(
    <8 x float>, i8 *, <8 x i32>, <8 x float>, i8) nounwind readonly
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(
    <4 x float>, i8 *, <4 x i64>, <4 x float>, i8) nounwind readonly
|
||||
|
||||
;; 8-wide float gather from a base pointer plus 32-bit offsets: a single
;; intrinsic call. The i32 mask is bitcast to float lanes and the i32 scale is
;; truncated to i8.
define <8 x float> @__gather_base_offsets32_float(i8 * %ptr, i32 %scale,
                                                  <8 x i32> %offsets,
                                                  <8 x i32> %vecmask)
    nounwind readonly alwaysinline {
  %fmask = bitcast <8 x i32> %vecmask to <8 x float>
  %scale_i8 = trunc i32 %scale to i8
  %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(
      <8 x float> undef, i8 * %ptr, <8 x i32> %offsets, <8 x float> %fmask, i8 %scale_i8)
  ret <8 x float> %res
}
|
||||
|
||||
|
||||
;; 8-wide float gather from a base pointer plus 64-bit offsets. The i32 mask
;; is bitcast to float lanes; lanes 0 to 3 and lanes 4 to 7 are gathered
;; separately and then concatenated.
define <8 x float> @__gather_base_offsets64_float(i8 * %ptr, i32 %scale,
                                                  <8 x i64> %offsets,
                                                  <8 x i32> %vecmask)
    nounwind readonly alwaysinline {
  %fmask = bitcast <8 x i32> %vecmask to <8 x float>
  %scale_i8 = trunc i32 %scale to i8
  extract_4s(float, fmask)
  extract_4s(i64, offsets)

  %lo = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
      <4 x float> undef, i8 * %ptr, <4 x i64> %offsets_1, <4 x float> %fmask_1, i8 %scale_i8)
  %hi = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
      <4 x float> undef, i8 * %ptr, <4 x i64> %offsets_2, <4 x float> %fmask_2, i8 %scale_i8)

  %res = shufflevector <4 x float> %lo, <4 x float> %hi,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}
|
||||
|
||||
|
||||
;; 8-wide float gather addressed by the 32-bit values in ptrs, which are
;; passed as indices against a null base pointer with a scale of 1. The i32
;; mask is bitcast to float lanes for the intrinsic.
define <8 x float> @__gather32_float(<8 x i32> %ptrs, <8 x i32> %vecmask)
    nounwind readonly alwaysinline {
  %fmask = bitcast <8 x i32> %vecmask to <8 x float>
  %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(
      <8 x float> undef, i8 * null, <8 x i32> %ptrs, <8 x float> %fmask, i8 1)
  ret <8 x float> %res
}
|
||||
|
||||
|
||||
;; 8-wide float gather addressed by the 64-bit values in ptrs, which are
;; passed as indices against a null base pointer with a scale of 1. The i32
;; mask is bitcast to float lanes; lanes 0 to 3 and lanes 4 to 7 are gathered
;; separately and then concatenated.
define <8 x float> @__gather64_float(<8 x i64> %ptrs, <8 x i32> %vecmask)
    nounwind readonly alwaysinline {
  %fmask = bitcast <8 x i32> %vecmask to <8 x float>
  extract_4s(float, fmask)
  extract_4s(i64, ptrs)

  %lo = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
      <4 x float> undef, i8 * null, <4 x i64> %ptrs_1, <4 x float> %fmask_1, i8 1)
  %hi = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
      <4 x float> undef, i8 * null, <4 x i64> %ptrs_2, <4 x float> %fmask_2, i8 1)

  %res = shufflevector <4 x float> %lo, <4 x float> %hi,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int64 gathers
|
||||
|
||||
;; AVX2 gather intrinsics that return four i64 lanes. The d form takes 32-bit
;; indices and the q form takes 64-bit indices.
;; Operands in order: target, base pointer, indices, mask, scale.
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(
    <4 x i64>, i8 *, <4 x i32>, <4 x i64>, i8) nounwind readonly
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(
    <4 x i64>, i8 *, <4 x i64>, <4 x i64>, i8) nounwind readonly
|
||||
|
||||
;; 8-wide i64 gather from a base pointer plus 32-bit offsets. The i32 mask is
;; sign-extended to i64 lanes; lanes 0 to 3 and lanes 4 to 7 are gathered
;; separately and then concatenated.
define <8 x i64> @__gather_base_offsets32_i64(i8 * %ptr, i32 %scale,
                                              <8 x i32> %offsets,
                                              <8 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask64 = sext <8 x i32> %mask32 to <8 x i64>
  %scale_i8 = trunc i32 %scale to i8
  extract_4s(i64, mask64)
  extract_4s(i32, offsets)

  %lo = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
      <4 x i64> undef, i8 * %ptr, <4 x i32> %offsets_1, <4 x i64> %mask64_1, i8 %scale_i8)
  %hi = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
      <4 x i64> undef, i8 * %ptr, <4 x i32> %offsets_2, <4 x i64> %mask64_2, i8 %scale_i8)

  %res = shufflevector <4 x i64> %lo, <4 x i64> %hi,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %res
}
|
||||
|
||||
|
||||
;; 8-wide i64 gather from a base pointer plus 64-bit offsets. The i32 mask is
;; sign-extended to i64 lanes; lanes 0 to 3 and lanes 4 to 7 are gathered
;; separately and then concatenated.
define <8 x i64> @__gather_base_offsets64_i64(i8 * %ptr, i32 %scale,
                                              <8 x i64> %offsets,
                                              <8 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask64 = sext <8 x i32> %mask32 to <8 x i64>
  %scale_i8 = trunc i32 %scale to i8
  extract_4s(i64, mask64)
  extract_4s(i64, offsets)

  %lo = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
      <4 x i64> undef, i8 * %ptr, <4 x i64> %offsets_1, <4 x i64> %mask64_1, i8 %scale_i8)
  %hi = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
      <4 x i64> undef, i8 * %ptr, <4 x i64> %offsets_2, <4 x i64> %mask64_2, i8 %scale_i8)

  %res = shufflevector <4 x i64> %lo, <4 x i64> %hi,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %res
}
|
||||
|
||||
|
||||
;; 8-wide i64 gather addressed by the 32-bit values in ptrs, which are passed
;; as indices against a null base pointer with a scale of 1. The i32 mask is
;; sign-extended to i64 lanes; lanes 0 to 3 and lanes 4 to 7 are gathered
;; separately and then concatenated.
define <8 x i64> @__gather32_i64(<8 x i32> %ptrs, <8 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask64 = sext <8 x i32> %mask32 to <8 x i64>
  extract_4s(i64, mask64)
  extract_4s(i32, ptrs)

  %lo = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
      <4 x i64> undef, i8 * null, <4 x i32> %ptrs_1, <4 x i64> %mask64_1, i8 1)
  %hi = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
      <4 x i64> undef, i8 * null, <4 x i32> %ptrs_2, <4 x i64> %mask64_2, i8 1)

  %res = shufflevector <4 x i64> %lo, <4 x i64> %hi,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %res
}
|
||||
|
||||
|
||||
;; 8-wide i64 gather addressed by the 64-bit values in ptrs, which are passed
;; as indices against a null base pointer with a scale of 1. The i32 mask is
;; sign-extended to i64 lanes; lanes 0 to 3 and lanes 4 to 7 are gathered
;; separately and then concatenated.
define <8 x i64> @__gather64_i64(<8 x i64> %ptrs, <8 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask64 = sext <8 x i32> %mask32 to <8 x i64>
  extract_4s(i64, mask64)
  extract_4s(i64, ptrs)

  %lo = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
      <4 x i64> undef, i8 * null, <4 x i64> %ptrs_1, <4 x i64> %mask64_1, i8 1)
  %hi = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
      <4 x i64> undef, i8 * null, <4 x i64> %ptrs_2, <4 x i64> %mask64_2, i8 1)

  %res = shufflevector <4 x i64> %lo, <4 x i64> %hi,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %res
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double gathers
|
||||
|
||||
;; AVX2 gather intrinsics that return four double lanes. The q form takes
;; 64-bit indices and the d form takes 32-bit indices.
;; Operands in order: target, base pointer, indices, mask, scale.
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(
    <4 x double>, i8 *, <4 x i64>, <4 x double>, i8) nounwind readonly
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(
    <4 x double>, i8 *, <4 x i32>, <4 x double>, i8) nounwind readonly
|
||||
|
||||
;; 8-wide double gather from a base pointer plus 32-bit offsets. The i32 mask
;; is sign-extended to i64 and bitcast to double lanes; lanes 0 to 3 and lanes
;; 4 to 7 are gathered separately and then concatenated.
define <8 x double> @__gather_base_offsets32_double(i8 * %ptr, i32 %scale,
                                                    <8 x i32> %offsets,
                                                    <8 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask_i64 = sext <8 x i32> %mask32 to <8 x i64>
  %mask_f64 = bitcast <8 x i64> %mask_i64 to <8 x double>
  %scale_i8 = trunc i32 %scale to i8
  extract_4s(double, mask_f64)
  extract_4s(i32, offsets)

  %lo = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
      <4 x double> undef, i8 * %ptr, <4 x i32> %offsets_1, <4 x double> %mask_f64_1, i8 %scale_i8)
  %hi = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
      <4 x double> undef, i8 * %ptr, <4 x i32> %offsets_2, <4 x double> %mask_f64_2, i8 %scale_i8)

  %res = shufflevector <4 x double> %lo, <4 x double> %hi,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}
|
||||
|
||||
;; 8-wide double gather from a base pointer plus 64-bit offsets. The i32 mask
;; is sign-extended to i64 and bitcast to double lanes; lanes 0 to 3 and lanes
;; 4 to 7 are gathered separately and then concatenated.
define <8 x double> @__gather_base_offsets64_double(i8 * %ptr, i32 %scale,
                                                    <8 x i64> %offsets,
                                                    <8 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask_i64 = sext <8 x i32> %mask32 to <8 x i64>
  %mask_f64 = bitcast <8 x i64> %mask_i64 to <8 x double>
  %scale_i8 = trunc i32 %scale to i8
  extract_4s(double, mask_f64)
  extract_4s(i64, offsets)

  %lo = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
      <4 x double> undef, i8 * %ptr, <4 x i64> %offsets_1, <4 x double> %mask_f64_1, i8 %scale_i8)
  %hi = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
      <4 x double> undef, i8 * %ptr, <4 x i64> %offsets_2, <4 x double> %mask_f64_2, i8 %scale_i8)

  %res = shufflevector <4 x double> %lo, <4 x double> %hi,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}
|
||||
|
||||
;; 8-wide double gather addressed by the 32-bit values in ptrs, which are
;; passed as indices against a null base pointer with a scale of 1. The i32
;; mask is sign-extended to i64 and bitcast to double lanes; lanes 0 to 3 and
;; lanes 4 to 7 are gathered separately and then concatenated.
define <8 x double> @__gather32_double(<8 x i32> %ptrs, <8 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask_i64 = sext <8 x i32> %mask32 to <8 x i64>
  %mask_f64 = bitcast <8 x i64> %mask_i64 to <8 x double>
  extract_4s(double, mask_f64)
  extract_4s(i32, ptrs)

  %lo = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
      <4 x double> undef, i8 * null, <4 x i32> %ptrs_1, <4 x double> %mask_f64_1, i8 1)
  %hi = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
      <4 x double> undef, i8 * null, <4 x i32> %ptrs_2, <4 x double> %mask_f64_2, i8 1)

  %res = shufflevector <4 x double> %lo, <4 x double> %hi,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}
|
||||
|
||||
;; 8-wide double gather addressed by the 64-bit values in ptrs, which are
;; passed as indices against a null base pointer with a scale of 1. The i32
;; mask is sign-extended to i64 and bitcast to double lanes; lanes 0 to 3 and
;; lanes 4 to 7 are gathered separately and then concatenated.
define <8 x double> @__gather64_double(<8 x i64> %ptrs, <8 x i32> %mask32)
    nounwind readonly alwaysinline {
  %mask_i64 = sext <8 x i32> %mask32 to <8 x i64>
  %mask_f64 = bitcast <8 x i64> %mask_i64 to <8 x double>
  extract_4s(double, mask_f64)
  extract_4s(i64, ptrs)

  %lo = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
      <4 x double> undef, i8 * null, <4 x i64> %ptrs_1, <4 x double> %mask_f64_1, i8 1)
  %hi = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
      <4 x double> undef, i8 * null, <4 x i64> %ptrs_2, <4 x double> %mask_f64_2, i8 1)

  %res = shufflevector <4 x double> %lo, <4 x double> %hi,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}
|
||||
|
||||
')
|
||||
955
builtins/target-generic-1.ll
Normal file
955
builtins/target-generic-1.ll
Normal file
@@ -0,0 +1,955 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Define the standard library builtins for the NOVEC target
|
||||
define(`MASK',`i32')
|
||||
define(`WIDTH',`1')
|
||||
include(`util.m4')
|
||||
; Define some basics for a 1-wide target
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
aossoa()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
|
||||
;; Select between two 1-wide i8 vectors: the result is the first operand
;; when the single lane of the i32 selector is zero and the second operand
;; otherwise. The work is done on scalars pulled out of the vectors rather
;; than with a vector select, because of problems with the LLVM scalarizer.
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
                              <1 x i32> %mask) nounwind readnone alwaysinline {
  %if_off = extractelement <1 x i8> %0, i32 0
  %if_on = extractelement <1 x i8> %1, i32 0
  %lane = extractelement <1 x i32> %mask, i32 0
  %is_on = icmp ne i32 %lane, 0
  %picked = select i1 %is_on, i8 %if_on, i8 %if_off
  %out = insertelement <1 x i8> undef, i8 %picked, i32 0
  ret <1 x i8> %out
}
|
||||
|
||||
;; Select between two 1-wide i16 vectors: the result is the first operand
;; when the single lane of the i32 selector is zero and the second operand
;; otherwise. Operates on extracted scalars, not on a vector select.
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
                                <1 x i32> %mask) nounwind readnone alwaysinline {
  %if_off = extractelement <1 x i16> %0, i32 0
  %if_on = extractelement <1 x i16> %1, i32 0
  %lane = extractelement <1 x i32> %mask, i32 0
  %is_on = icmp ne i32 %lane, 0
  %picked = select i1 %is_on, i16 %if_on, i16 %if_off
  %out = insertelement <1 x i16> undef, i16 %picked, i32 0
  ret <1 x i16> %out
}
|
||||
|
||||
|
||||
;; Select between two 1-wide i32 vectors: the result is the first operand
;; when the single lane of the i32 selector is zero and the second operand
;; otherwise. Operates on extracted scalars, not on a vector select.
define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
                                <1 x i32> %mask) nounwind readnone alwaysinline {
  %if_off = extractelement <1 x i32> %0, i32 0
  %if_on = extractelement <1 x i32> %1, i32 0
  %lane = extractelement <1 x i32> %mask, i32 0
  %is_on = icmp ne i32 %lane, 0
  %picked = select i1 %is_on, i32 %if_on, i32 %if_off
  %out = insertelement <1 x i32> undef, i32 %picked, i32 0
  ret <1 x i32> %out
}
|
||||
|
||||
;; Select between two 1-wide i64 vectors: the result is the first operand
;; when the single lane of the i32 selector is zero and the second operand
;; otherwise. Operates on extracted scalars, not on a vector select.
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
                                <1 x i32> %mask) nounwind readnone alwaysinline {
  %if_off = extractelement <1 x i64> %0, i32 0
  %if_on = extractelement <1 x i64> %1, i32 0
  %lane = extractelement <1 x i32> %mask, i32 0
  %is_on = icmp ne i32 %lane, 0
  %picked = select i1 %is_on, i64 %if_on, i64 %if_off
  %out = insertelement <1 x i64> undef, i64 %picked, i32 0
  ret <1 x i64> %out
}
|
||||
|
||||
;; Select between two 1-wide float vectors: the result is the first operand
;; when the single lane of the i32 selector is zero and the second operand
;; otherwise. Operates on extracted scalars, not on a vector select.
define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
                                    <1 x i32> %mask) nounwind readnone alwaysinline {
  %if_off = extractelement <1 x float> %0, i32 0
  %if_on = extractelement <1 x float> %1, i32 0
  %lane = extractelement <1 x i32> %mask, i32 0
  %is_on = icmp ne i32 %lane, 0
  %picked = select i1 %is_on, float %if_on, float %if_off
  %out = insertelement <1 x float> undef, float %picked, i32 0
  ret <1 x float> %out
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
;; Blending store for a 1-wide i8 vector: loads the value currently in
;; memory, picks between it and the new value with @__vselect_i8 (the old
;; value is kept when the selector lane is zero), and stores the result back.
define void @__masked_store_blend_i8(<1 x i8>* nocapture, <1 x i8>,
                                     <1 x i32> %mask) nounwind alwaysinline {
  ; Nothing visible here guarantees more than byte alignment for a <1 x i8>
  ; object, and overstating the alignment of a load or store is undefined
  ; behavior, so only align 1 is claimed.
  %old = load <1 x i8> * %0, align 1
  %blended = call <1 x i8> @__vselect_i8(<1 x i8> %old, <1 x i8> %1, <1 x i32> %mask)
  store <1 x i8> %blended, <1 x i8> * %0, align 1
  ret void
}
|
||||
|
||||
;; Blending store for a 1-wide i16 vector: loads the value currently in
;; memory, picks between it and the new value with @__vselect_i16 (the old
;; value is kept when the selector lane is zero), and stores the result back.
define void @__masked_store_blend_i16(<1 x i16>* nocapture, <1 x i16>,
                                      <1 x i32> %mask) nounwind alwaysinline {
  ; Nothing visible here guarantees 4-byte alignment for a <1 x i16> object,
  ; and overstating the alignment of a load or store is undefined behavior,
  ; so only the 2-byte element alignment is claimed.
  ; NOTE(review): confirm callers provide at least 2-byte alignment.
  %old = load <1 x i16> * %0, align 2
  %blended = call <1 x i16> @__vselect_i16(<1 x i16> %old, <1 x i16> %1, <1 x i32> %mask)
  store <1 x i16> %blended, <1 x i16> * %0, align 2
  ret void
}
|
||||
|
||||
;; Blending store for a 1-wide i32 vector: loads the value currently in
;; memory, picks between it and the new value with @__vselect_i32 (the old
;; value is kept when the selector lane is zero), and stores the result back.
define void @__masked_store_blend_i32(<1 x i32>* nocapture, <1 x i32>,
                                      <1 x i32> %mask) nounwind alwaysinline {
  %old = load <1 x i32> * %0, align 4
  %blended = call <1 x i32> @__vselect_i32(<1 x i32> %old, <1 x i32> %1, <1 x i32> %mask)
  store <1 x i32> %blended, <1 x i32> * %0, align 4
  ret void
}
|
||||
|
||||
;; Blending store for a 1-wide i64 vector: loads the value currently in
;; memory, picks between it and the new value with @__vselect_i64 (the old
;; value is kept when the selector lane is zero), and stores the result back.
;; Both memory accesses claim only 4-byte alignment.
define void @__masked_store_blend_i64(<1 x i64>* nocapture, <1 x i64>,
                                      <1 x i32> %mask) nounwind alwaysinline {
  %old = load <1 x i64> * %0, align 4
  %blended = call <1 x i64> @__vselect_i64(<1 x i64> %old, <1 x i64> %1, <1 x i32> %mask)
  store <1 x i64> %blended, <1 x i64> * %0, align 4
  ret void
}
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;; Returns the top (sign) bit of the single i32 lane as an i64 that is
;; either 0 or 1.
define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
  %lane = extractelement <1 x i32> %0, i32 0
  %wide = zext i32 %lane to i64
  %bit = lshr i64 %wide, 31
  ret i64 %bit
}
|
||||
|
||||
;; True when the sign bit of the single i32 lane is set.
define i1 @__any(<1 x i32>) nounwind readnone alwaysinline {
  %lane = extractelement <1 x i32> %0, i32 0
  ; a set sign bit is the same as the lane being negative
  %on = icmp slt i32 %lane, 0
  ret i1 %on
}
|
||||
|
||||
;; True when the sign bit of the single i32 lane is set; with only one lane
;; this is the same test as @__any.
define i1 @__all(<1 x i32>) nounwind readnone alwaysinline {
  %lane = extractelement <1 x i32> %0, i32 0
  ; a set sign bit is the same as the lane being negative
  %on = icmp slt i32 %lane, 0
  ret i1 %on
}
|
||||
|
||||
;; True when the sign bit of the single i32 lane is clear.
define i1 @__none(<1 x i32>) nounwind readnone alwaysinline {
  %lane = extractelement <1 x i32> %0, i32 0
  ; a clear sign bit is the same as the lane being non-negative
  %off = icmp sgt i32 %lane, -1
  ret i1 %off
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
;; This generic target does not use any native rounding instructions, so we have to emulate
|
||||
;; the functionality with multiple instructions...
|
||||
|
||||
; The code for __round_* is the result of compiling the following source
|
||||
; code.
|
||||
;
|
||||
; export float Round(float x) {
|
||||
; unsigned int sign = signbits(x);
|
||||
; unsigned int ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; x += 0x1.0p23f;
|
||||
; x -= 0x1.0p23f;
|
||||
; ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; return x;
|
||||
;}
|
||||
|
||||
;; Rounds the single float lane by working on its magnitude: the sign bit is
;; split off, 2^23 (8388608.0) is added to and then subtracted from the
;; magnitude, and the saved sign bit is xor-ed back into the result.
define <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
  %bits = bitcast <1 x float> %0 to <1 x i32>
  %sign = and <1 x i32> %bits, <i32 -2147483648>
  ; clearing the sign bit gives the same bits as xor-ing the sign back out
  %abs_bits = and <1 x i32> %bits, <i32 2147483647>
  %abs = bitcast <1 x i32> %abs_bits to <1 x float>
  %bumped = fadd <1 x float> %abs, <float 8.388608e+06>
  %rounded = fadd <1 x float> %bumped, <float -8.388608e+06>
  %rounded_bits = bitcast <1 x float> %rounded to <1 x i32>
  %signed_bits = xor <1 x i32> %rounded_bits, %sign
  %result = bitcast <1 x i32> %signed_bits to <1 x float>
  ret <1 x float> %result
}
|
||||
|
||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||
;; bitcode from compiling the following source code...
|
||||
|
||||
;export float Floor(float x) {
|
||||
; float y = Round(x);
|
||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
||||
; float delta = -1.f;
|
||||
; unsigned int idelta = intbits(delta);
|
||||
; idelta &= cmp;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
;; Floor of the single float lane: round with @__round_varying_float, then
;; add -1.0 when the rounded value is greater than the input. The -1.0 is
;; produced by and-ing its bit pattern with the sign-extended comparison
;; result, so the adjustment is 0.0 when the comparison is false.
define <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
  %nearest = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
  %went_up = fcmp olt <1 x float> %0, %nearest
  %went_up32 = sext <1 x i1> %went_up to <1 x i32>
  ; -1082130432 is the bit pattern of -1.0f
  %adjust_bits = and <1 x i32> %went_up32, <i32 -1082130432>
  %adjust = bitcast <1 x i32> %adjust_bits to <1 x float>
  %floored = fadd <1 x float> %nearest, %adjust
  ret <1 x float> %floored
}
|
||||
|
||||
;; And here is the code we compiled to get the __ceil* functions below
|
||||
;
|
||||
;export uniform float Ceil(uniform float x) {
|
||||
; uniform float y = Round(x);
|
||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
||||
; uniform float delta = 1.f;
|
||||
; uniform int idelta = intbits(delta);
|
||||
; idelta &= yltx;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
;; Ceiling of the single float lane: round with @__round_varying_float, then
;; add 1.0 when the rounded value is less than the input. The 1.0 is
;; produced by and-ing its bit pattern with the sign-extended comparison
;; result, so the adjustment is 0.0 when the comparison is false.
define <1 x float> @__ceil_varying_float(<1 x float>) nounwind readonly alwaysinline {
  %nearest = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
  %went_down = fcmp ogt <1 x float> %0, %nearest
  %went_down32 = sext <1 x i1> %went_down to <1 x i32>
  ; 1065353216 is the bit pattern of 1.0f
  %adjust_bits = and <1 x i32> %went_down32, <i32 1065353216>
  %adjust = bitcast <1 x i32> %adjust_bits to <1 x float>
  %ceiled = fadd <1 x float> %nearest, %adjust
  ret <1 x float> %ceiled
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
; expecting math lib to provide this
|
||||
declare double @ceil (double) nounwind readnone
|
||||
declare double @floor (double) nounwind readnone
|
||||
declare double @round (double) nounwind readnone
|
||||
;declare float @llvm.sqrt.f32(float %Val)
|
||||
declare double @llvm.sqrt.f64(double %Val)
|
||||
declare float @llvm.sin.f32(float %Val)
|
||||
declare float @llvm.cos.f32(float %Val)
|
||||
declare float @llvm.sqrt.f32(float %Val)
|
||||
declare float @llvm.exp.f32(float %Val)
|
||||
declare float @llvm.log.f32(float %Val)
|
||||
declare float @llvm.pow.f32(float %f, float %e)
|
||||
|
||||
|
||||
|
||||
|
||||
;; stuff that could be in builtins ...
|
||||
|
||||
;; Helper that forms a complete function body: it applies a scalar function
;; to the single lane of the unnamed first parameter and returns the result
;; as a 1-wide vector. The first parameter of the helper is the element
;; type and the second is the scalar function to call.
define(`unary1to1', `
  %arg_0 = extractelement <1 x $1> %0, i32 0
  %res_0 = call $1 $2($1 %arg_0)
  %out_0 = insertelement <1 x $1> undef, $1 %res_0, i32 0
  ret <1 x $1> %out_0
')
|
||||
|
||||
|
||||
|
||||
;; dummy 1 wide vector ops
|
||||
;; 1-wide AOS to SOA conversion for four floats: with a single lane there is
;; nothing to transpose, so each input is stored to its matching output.
define void
@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
                      <1 x float> %v3, <1 x float> * noalias %out0,
                      <1 x float> * noalias %out1, <1 x float> * noalias %out2,
                      <1 x float> * noalias %out3) nounwind alwaysinline {
  store <1 x float> %v0, <1 x float > * %out0
  store <1 x float> %v1, <1 x float > * %out1
  store <1 x float> %v2, <1 x float > * %out2
  store <1 x float> %v3, <1 x float > * %out3
  ret void
}
|
||||
|
||||
;; 1-wide SOA to AOS conversion for four floats. With a single lane this is
;; the same operation as @__aos_to_soa4_float1, so each input is stored to
;; its matching output.
define void
@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
                      <1 x float> %v3, <1 x float> * noalias %out0,
                      <1 x float> * noalias %out1, <1 x float> * noalias %out2,
                      <1 x float> * noalias %out3) nounwind alwaysinline {
  store <1 x float> %v0, <1 x float > * %out0
  store <1 x float> %v1, <1 x float > * %out1
  store <1 x float> %v2, <1 x float > * %out2
  store <1 x float> %v3, <1 x float > * %out3
  ret void
}
|
||||
|
||||
;; 1-wide AOS to SOA conversion for three floats: with a single lane there
;; is nothing to transpose, so each input is stored to its matching output,
;; in order from the first to the third.
define void
@__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
                      <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
                      <1 x float> * %out2) {
  store <1 x float> %v0, <1 x float > * %out0
  store <1 x float> %v1, <1 x float > * %out1
  store <1 x float> %v2, <1 x float > * %out2
  ret void
}
|
||||
|
||||
;; 1-wide SOA to AOS conversion for three floats. With a single lane this is
;; the same operation as @__aos_to_soa3_float1, so each input is stored to
;; its matching output, in order from the first to the third.
define void
@__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1,
                      <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
                      <1 x float> * %out2) {
  store <1 x float> %v0, <1 x float > * %out0
  store <1 x float> %v1, <1 x float > * %out1
  store <1 x float> %v2, <1 x float > * %out2
  ret void
}
|
||||
|
||||
|
||||
;; end builtins
|
||||
|
||||
|
||||
;; Rounds the single double lane by calling the scalar @round function.
define <1 x double> @__round_varying_double(<1 x double>) nounwind readonly alwaysinline {
  %x = extractelement <1 x double> %0, i32 0
  %y = call double @round(double %x)
  %out = insertelement <1 x double> undef, double %y, i32 0
  ret <1 x double> %out
}
|
||||
|
||||
;; Floor of the single double lane, computed by the scalar @floor function.
define <1 x double> @__floor_varying_double(<1 x double>) nounwind readonly alwaysinline {
  %x = extractelement <1 x double> %0, i32 0
  %y = call double @floor(double %x)
  %out = insertelement <1 x double> undef, double %y, i32 0
  ret <1 x double> %out
}
|
||||
|
||||
|
||||
;; Ceiling of the single double lane, computed by the scalar @ceil function.
define <1 x double> @__ceil_varying_double(<1 x double>) nounwind readonly alwaysinline {
  %x = extractelement <1 x double> %0, i32 0
  %y = call double @ceil(double %x)
  %out = insertelement <1 x double> undef, double %y, i32 0
  ret <1 x double> %out
}
|
||||
|
||||
; To do vector integer min and max, we do the vector compare and then sign
|
||||
; extend the i1 vector result to an i32 mask. The __vselect does the
|
||||
; rest...
|
||||
|
||||
;; Signed minimum of two 1-wide i32 vectors. The comparison result is
;; sign-extended to an i32 selector and @__vselect_i32 returns the first
;; operand when it is the smaller one, otherwise the second.
define <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
  %first_less = icmp sgt <1 x i32> %1, %0
  %sel = sext <1 x i1> %first_less to <1 x i32>
  %smaller = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %sel)
  ret <1 x i32> %smaller
}
|
||||
|
||||
;; Signed minimum of two i32 scalars.
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %first_less = icmp sgt i32 %1, %0
  %smaller = select i1 %first_less, i32 %0, i32 %1
  ret i32 %smaller
}
|
||||
|
||||
;; Signed maximum of two 1-wide i32 vectors. The comparison result is
;; sign-extended to an i32 selector and @__vselect_i32 returns the first
;; operand when it is the larger one, otherwise the second.
define <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
  %first_greater = icmp slt <1 x i32> %1, %0
  %sel = sext <1 x i1> %first_greater to <1 x i32>
  %larger = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %sel)
  ret <1 x i32> %larger
}
|
||||
|
||||
;; Signed maximum of two i32 scalars.
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %first_greater = icmp slt i32 %1, %0
  %larger = select i1 %first_greater, i32 %0, i32 %1
  ret i32 %larger
}
|
||||
|
||||
; The functions for unsigned ints are similar, just with unsigned
|
||||
; comparison functions...
|
||||
|
||||
;; Unsigned minimum of two 1-wide i32 vectors. The comparison result is
;; sign-extended to an i32 selector and @__vselect_i32 returns the first
;; operand when it is the smaller one, otherwise the second.
define <1 x i32> @__min_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
  %first_less = icmp ugt <1 x i32> %1, %0
  %sel = sext <1 x i1> %first_less to <1 x i32>
  %smaller = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %sel)
  ret <1 x i32> %smaller
}
|
||||
|
||||
;; Unsigned minimum of two i32 scalars.
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %first_less = icmp ugt i32 %1, %0
  %smaller = select i1 %first_less, i32 %0, i32 %1
  ret i32 %smaller
}
|
||||
|
||||
;; Unsigned maximum of two 1-wide i32 vectors. The comparison result is
;; sign-extended to an i32 selector and @__vselect_i32 returns the first
;; operand when it is the larger one, otherwise the second.
define <1 x i32> @__max_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
  %first_greater = icmp ult <1 x i32> %1, %0
  %sel = sext <1 x i1> %first_greater to <1 x i32>
  %larger = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %sel)
  ret <1 x i32> %larger
}
|
||||
|
||||
;; Unsigned maximum of two i32 scalars.
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %first_greater = icmp ult i32 %1, %0
  %larger = select i1 %first_greater, i32 %0, i32 %1
  ret i32 %larger
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
;; Number of set bits in an i32, via the ctpop intrinsic.
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %set_bits = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %set_bits
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
;; Number of set bits in an i64, via the ctpop intrinsic.
define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
  %set_bits = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %set_bits
}
|
||||
|
||||
|
||||
;; Sum over a 1-wide float vector: simply the value of its only lane.
define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
  %only_lane = extractelement <1 x float> %v, i32 0
  ret float %only_lane
}
|
||||
|
||||
;; Minimum over a 1-wide float vector: simply the value of its only lane.
define float @__reduce_min_float(<1 x float>) nounwind readnone {
  %only_lane = extractelement <1 x float> %0, i32 0
  ret float %only_lane
}
|
||||
|
||||
;; Maximum over a 1-wide float vector: simply the value of its only lane.
define float @__reduce_max_float(<1 x float>) nounwind readnone {
  %only_lane = extractelement <1 x float> %0, i32 0
  ret float %only_lane
}
|
||||
|
||||
;; Sum over a 1-wide i32 vector: simply the value of its only lane.
define i32 @__reduce_add_int32(<1 x i32> %v) nounwind readnone {
  %only_lane = extractelement <1 x i32> %v, i32 0
  ret i32 %only_lane
}
|
||||
|
||||
;; Signed minimum over a 1-wide i32 vector: the value of its only lane.
define i32 @__reduce_min_int32(<1 x i32>) nounwind readnone {
  %only_lane = extractelement <1 x i32> %0, i32 0
  ret i32 %only_lane
}
|
||||
|
||||
;; Signed maximum over a 1-wide i32 vector: the value of its only lane.
define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
  %only_lane = extractelement <1 x i32> %0, i32 0
  ret i32 %only_lane
}
|
||||
|
||||
;; Unsigned minimum over a 1-wide i32 vector: the value of its only lane.
define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
  %only_lane = extractelement <1 x i32> %0, i32 0
  ret i32 %only_lane
}
|
||||
|
||||
;; Unsigned maximum over a 1-wide i32 vector: the value of its only lane.
define i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone {
  %only_lane = extractelement <1 x i32> %0, i32 0
  ret i32 %only_lane
}
|
||||
|
||||
|
||||
;; Sum over a 1-wide double vector: simply the value of its only lane.
define double @__reduce_add_double(<1 x double>) nounwind readnone {
  %only_lane = extractelement <1 x double> %0, i32 0
  ret double %only_lane
}
|
||||
|
||||
;; Minimum over a 1-wide double vector: simply the value of its only lane.
define double @__reduce_min_double(<1 x double>) nounwind readnone {
  %only_lane = extractelement <1 x double> %0, i32 0
  ret double %only_lane
}
|
||||
|
||||
;; Maximum over a 1-wide double vector: simply the value of its only lane.
define double @__reduce_max_double(<1 x double>) nounwind readnone {
  %only_lane = extractelement <1 x double> %0, i32 0
  ret double %only_lane
}
|
||||
|
||||
;; Sum over a 1-wide i64 vector: simply the value of its only lane.
define i64 @__reduce_add_int64(<1 x i64>) nounwind readnone {
  %only_lane = extractelement <1 x i64> %0, i32 0
  ret i64 %only_lane
}
|
||||
|
||||
;; Signed minimum over a 1-wide i64 vector: the value of its only lane.
define i64 @__reduce_min_int64(<1 x i64>) nounwind readnone {
  %only_lane = extractelement <1 x i64> %0, i32 0
  ret i64 %only_lane
}
|
||||
|
||||
;; Signed maximum over a 1-wide i64 vector: the value of its only lane.
define i64 @__reduce_max_int64(<1 x i64>) nounwind readnone {
  %only_lane = extractelement <1 x i64> %0, i32 0
  ret i64 %only_lane
}
|
||||
|
||||
;; Unsigned minimum over a 1-wide i64 vector: the value of its only lane.
define i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone {
  %only_lane = extractelement <1 x i64> %0, i32 0
  ret i64 %only_lane
}
|
||||
|
||||
;; Unsigned maximum over a 1-wide i64 vector: the value of its only lane.
define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone {
  %only_lane = extractelement <1 x i64> %0, i32 0
  ret i64 %only_lane
}
|
||||
|
||||
;; Equality reduction for a 1-wide i32 vector. The only lane is written to
;; the samevalue pointer and true is always returned; the selector operand
;; is not consulted.
define i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue,
                                <1 x i32> %mask) nounwind alwaysinline {
  %only_lane = extractelement <1 x i32> %vv, i32 0
  store i32 %only_lane, i32 * %samevalue
  ret i1 true
}
|
||||
|
||||
;; Equality reduction for a 1-wide float vector. The only lane is written to
;; the samevalue pointer and true is always returned; the selector operand
;; is not consulted.
define i1 @__reduce_equal_float(<1 x float> %vv, float * %samevalue,
                                <1 x i32> %mask) nounwind alwaysinline {
  %only_lane = extractelement <1 x float> %vv, i32 0
  store float %only_lane, float * %samevalue
  ret i1 true
}
|
||||
|
||||
;; Equality reduction for a 1-wide i64 vector. The only lane is written to
;; the samevalue pointer and true is always returned; the selector operand
;; is not consulted.
define i1 @__reduce_equal_int64(<1 x i64> %vv, i64 * %samevalue,
                                <1 x i32> %mask) nounwind alwaysinline {
  %only_lane = extractelement <1 x i64> %vv, i32 0
  store i64 %only_lane, i64 * %samevalue
  ret i1 true
}
|
||||
|
||||
;; Equality reduction for a 1-wide double vector. The only lane is written
;; to the samevalue pointer and true is always returned; the selector
;; operand is not consulted.
define i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue,
                                 <1 x i32> %mask) nounwind alwaysinline {
  %only_lane = extractelement <1 x double> %vv, i32 0
  store double %only_lane, double * %samevalue
  ret i1 true
}
|
||||
|
||||
; extracting/reinserting elements because I want to be able to remove vectors later on
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
;; Reciprocal of the single float lane, computed with a plain division of
;; 1.0 by the lane (no approximation instruction and no refinement step).
define <1 x float> @__rcp_varying_float(<1 x float>) nounwind readonly alwaysinline {
  %denom = extractelement <1 x float> %0, i32 0
  %inverse = fdiv float 1.0, %denom
  %out = insertelement <1 x float> undef, float %inverse, i32 0
  ret <1 x float> %out
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; sqrt
|
||||
|
||||
;; Square root of the single float lane via the scalar sqrt intrinsic.
define <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysinline {
  %x = extractelement <1 x float> %0, i32 0
  %root = call float @llvm.sqrt.f32(float %x)
  %out = insertelement <1 x float> undef, float %root, i32 0
  ret <1 x float> %out
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
;; Reciprocal square root of the single float lane, built from the two
;; helpers: first @__sqrt_varying_float, then @__rcp_varying_float on its
;; result.
define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline {
  %root = call <1 x float> @__sqrt_varying_float(<1 x float> %v)
  %inverse = call <1 x float> @__rcp_varying_float(<1 x float> %root)
  ret <1 x float> %inverse
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
;; Sine of the single float lane via the scalar sin intrinsic.
define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
  %x = extractelement <1 x float> %0, i32 0
  %y = call float @llvm.sin.f32(float %x)
  %out = insertelement <1 x float> undef, float %y, i32 0
  ret <1 x float> %out
}
|
||||
|
||||
;; Cosine of the single float lane via the scalar cos intrinsic.
define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
  %x = extractelement <1 x float> %0, i32 0
  %y = call float @llvm.cos.f32(float %x)
  %out = insertelement <1 x float> undef, float %y, i32 0
  ret <1 x float> %out
}
|
||||
|
||||
;; Computes sine and cosine of the single float lane with @__svml_sin and
;; @__svml_cos. The sine is stored through the second parameter and the
;; cosine through the third.
;;
;; This function writes memory through its pointer parameters, so it must
;; not carry the readnone attribute: that attribute promises that no memory
;; is accessed, and an optimizer is entitled to drop calls or stores on the
;; strength of it.
define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind alwaysinline {
  %sine = call <1 x float> @__svml_sin (<1 x float> %0)
  %cosine = call <1 x float> @__svml_cos (<1 x float> %0)
  store <1 x float> %sine, <1 x float> * %1
  store <1 x float> %cosine, <1 x float> * %2
  ret void
}
|
||||
|
||||
;; UNSUPPORTED on this target: no tangent is computed. The input vector is
;; returned unchanged, so callers get the argument back, not its tangent.
define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
  ret <1 x float > %0
}
|
||||
|
||||
;; UNSUPPORTED on this target: no arctangent is computed. The input vector
;; is returned unchanged, so callers get the argument back.
define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
  ret <1 x float > %0
}
|
||||
|
||||
;; UNSUPPORTED on this target: no two-argument arctangent is computed. The
;; first input vector is returned unchanged and the second is not used.
define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
  ret <1 x float > %0
}
|
||||
|
||||
;; Exponential of the single float lane via the scalar exp intrinsic.
define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
  %x = extractelement <1 x float> %0, i32 0
  %y = call float @llvm.exp.f32(float %x)
  %out = insertelement <1 x float> undef, float %y, i32 0
  ret <1 x float> %out
}
|
||||
|
||||
;; Logarithm of the single float lane via the scalar log intrinsic.
define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
  %x = extractelement <1 x float> %0, i32 0
  %y = call float @llvm.log.f32(float %x)
  %out = insertelement <1 x float> undef, float %y, i32 0
  ret <1 x float> %out
}
|
||||
|
||||
;; Power function on single float lanes via the scalar pow intrinsic: the
;; first parameter is the base and the second is the exponent.
define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
  %base = extractelement <1 x float> %0, i32 0
  %exponent = extractelement <1 x float> %1, i32 0
  %power = call float @llvm.pow.f32(float %base,float %exponent)
  %out = insertelement <1 x float> undef, float %power, i32 0
  ret <1 x float> %out
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
;; Maximum of two single float lanes. The first operand is returned only
;; when an ordered comparison finds it greater; in every other case
;; (including unordered operands) the second operand is returned.
define <1 x float> @__max_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
  %lhs = extractelement <1 x float> %0, i32 0
  %rhs = extractelement <1 x float> %1, i32 0
  %lhs_greater = fcmp olt float %rhs, %lhs
  %larger = select i1 %lhs_greater, float %lhs, float %rhs
  %out = insertelement <1 x float> undef, float %larger, i32 0
  ret <1 x float> %out
}
|
||||
|
||||
;; Minimum of two single float lanes. The first operand is returned only
;; when an ordered comparison finds it smaller; in every other case
;; (including unordered operands) the second operand is returned.
define <1 x float> @__min_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
  %lhs = extractelement <1 x float> %0, i32 0
  %rhs = extractelement <1 x float> %1, i32 0
  %lhs_less = fcmp ogt float %rhs, %lhs
  %smaller = select i1 %lhs_less, float %lhs, float %rhs
  %out = insertelement <1 x float> undef, float %smaller, i32 0
  ret <1 x float> %out
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
;declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
|
||||
;; Square root of the single double lane via the scalar sqrt intrinsic.
define <1 x double> @__sqrt_varying_double(<1 x double>) nounwind alwaysinline {
  %x = extractelement <1 x double> %0, i32 0
  %root = call double @llvm.sqrt.f64(double %x)
  %out = insertelement <1 x double> undef, double %root, i32 0
  ret <1 x double> %out
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
;declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
;declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
;; Minimum of two single double lanes. The first operand is returned only
;; when an ordered comparison finds it smaller; in every other case
;; (including unordered operands) the second operand is returned.
define <1 x double> @__min_varying_double(<1 x double>, <1 x double>) nounwind readnone {
  %lhs = extractelement <1 x double> %0, i32 0
  %rhs = extractelement <1 x double> %1, i32 0
  %lhs_less = fcmp ogt double %rhs, %lhs
  %smaller = select i1 %lhs_less, double %lhs, double %rhs
  %out = insertelement <1 x double> undef, double %smaller, i32 0
  ret <1 x double> %out
}
|
||||
|
||||
;; Maximum of two single double lanes. The first operand is returned only
;; when an ordered comparison finds it greater; in every other case
;; (including unordered operands) the second operand is returned.
define <1 x double> @__max_varying_double(<1 x double>, <1 x double>) nounwind readnone {
  %lhs = extractelement <1 x double> %0, i32 0
  %rhs = extractelement <1 x double> %1, i32 0
  %lhs_greater = fcmp olt double %rhs, %lhs
  %larger = select i1 %lhs_greater, double %lhs, double %rhs
  %out = insertelement <1 x double> undef, double %larger, i32 0
  ret <1 x double> %out
}
|
||||
|
||||
|
||||
;; Reciprocal of a scalar float, computed with a plain division of 1.0 by
;; the argument (no approximation instruction and no refinement step).
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
  %inverse = fdiv float 1.0, %0
  ret float %inverse
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
;; Rounds a scalar float by wrapping it in a 1-wide vector, calling
;; @__round_varying_float, and unwrapping the single lane of the result.
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
  %wrapped = insertelement<1 x float> undef, float %0, i32 0
  %rounded = call <1 x float> @__round_varying_float(<1 x float> %wrapped)
  %scalar = extractelement <1 x float> %rounded, i32 0
  ret float %scalar
}
|
||||
|
||||
;; Floor of a scalar float: wraps it in a 1-wide vector, calls
;; @__floor_varying_float, and unwraps the single lane of the result.
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  %wrapped = insertelement<1 x float> undef, float %0, i32 0
  %floored = call <1 x float> @__floor_varying_float(<1 x float> %wrapped)
  %scalar = extractelement <1 x float> %floored, i32 0
  ret float %scalar
}
|
||||
|
||||
;; Ceiling of a scalar float: wraps it in a 1-wide vector, calls
;; @__ceil_varying_float, and unwraps the single lane of the result.
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  %wrapped = insertelement<1 x float> undef, float %0, i32 0
  %ceiled = call <1 x float> @__ceil_varying_float(<1 x float> %wrapped)
  %scalar = extractelement <1 x float> %ceiled, i32 0
  ret float %scalar
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
|
||||
;; Rounds a scalar double by calling the @round function declared in this
;; file.
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
  %rounded = call double @round(double %0)
  ret double %rounded
}
|
||||
|
||||
;; Floor of a scalar double, computed by the @floor function declared in
;; this file.
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  %floored = call double @floor(double %0)
  ret double %floored
}
|
||||
|
||||
;; Ceiling of a scalar double, computed by the @ceil function declared in
;; this file.
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  %ceiled = call double @ceil(double %0)
  ret double %ceiled
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
|
||||
;; Square root of a scalar float via the sqrt intrinsic.
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
  %root = call float @llvm.sqrt.f32(float %0)
  ret float %root
}
|
||||
|
||||
;; Square root of a scalar double via the sqrt intrinsic.
define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {
  %root = call double @llvm.sqrt.f64(double %0)
  ret double %root
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
|
||||
;; Reciprocal square root of a scalar float, built from the two helpers:
;; first @__sqrt_uniform_float, then @__rcp_uniform_float on its result.
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  %root = call float @__sqrt_uniform_float(float %0)
  %inverse = call float @__rcp_uniform_float(float %root)
  ret float %inverse
}
|
||||
|
||||
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fastmath
|
||||
|
||||
|
||||
;; Fast-math hook: this target has nothing to configure, so the function
;; returns immediately without doing anything.
define void @__fastmath() nounwind alwaysinline {
  ret void
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
|
||||
;; Maximum of two scalar floats. The first operand is returned only when an
;; ordered comparison finds it greater; in every other case (including
;; unordered operands) the second operand is returned.
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
  %lhs_greater = fcmp olt float %1, %0
  %larger = select i1 %lhs_greater, float %0, float %1
  ret float %larger
}
|
||||
|
||||
;; Minimum of two scalar floats. The first operand is returned only when an
;; ordered comparison finds it smaller; in every other case (including
;; unordered operands) the second operand is returned.
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
  %lhs_less = fcmp ogt float %1, %0
  %smaller = select i1 %lhs_less, float %0, float %1
  ret float %smaller
}
|
||||
;; Maximum of two scalar doubles. The first operand is returned only when
;; an ordered comparison finds it greater; in every other case (including
;; unordered operands) the second operand is returned.
define double @__max_uniform_double(double, double) nounwind readonly alwaysinline {
  %lhs_greater = fcmp olt double %1, %0
  %larger = select i1 %lhs_greater, double %0, double %1
  ret double %larger
}
|
||||
|
||||
;; Minimum of two scalar doubles. The first operand is returned only when
;; an ordered comparison finds it smaller; in every other case (including
;; unordered operands) the second operand is returned.
define double @__min_uniform_double(double, double) nounwind readonly alwaysinline {
  %lhs_less = fcmp ogt double %1, %0
  %smaller = select i1 %lhs_less, double %0, double %1
  ret double %smaller
}
|
||||
|
||||
define_shuffles()
|
||||
|
||||
ctlztz()
|
||||
|
||||
define_prefetches()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
34
builtins/target-generic-16.ll
Normal file
34
builtins/target-generic-16.ll
Normal file
@@ -0,0 +1,34 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
define(`WIDTH',`16')
|
||||
include(`target-generic-common.ll')
|
||||
|
||||
33
builtins/target-generic-32.ll
Normal file
33
builtins/target-generic-32.ll
Normal file
@@ -0,0 +1,33 @@
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
define(`WIDTH',`32')
|
||||
include(`target-generic-common.ll')
|
||||
34
builtins/target-generic-4.ll
Normal file
34
builtins/target-generic-4.ll
Normal file
@@ -0,0 +1,34 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
define(`WIDTH',`4')
|
||||
include(`target-generic-common.ll')
|
||||
|
||||
33
builtins/target-generic-64.ll
Normal file
33
builtins/target-generic-64.ll
Normal file
@@ -0,0 +1,33 @@
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
define(`WIDTH',`64')
|
||||
include(`target-generic-common.ll')
|
||||
34
builtins/target-generic-8.ll
Normal file
34
builtins/target-generic-8.ll
Normal file
@@ -0,0 +1,34 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
define(`WIDTH',`8')
|
||||
include(`target-generic-common.ll')
|
||||
|
||||
381
builtins/target-generic-common.ll
Normal file
381
builtins/target-generic-common.ll
Normal file
@@ -0,0 +1,381 @@
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32-v4:128:128";
|
||||
|
||||
define(`MASK',`i1')
|
||||
define(`HAVE_GATHER',`1')
|
||||
define(`HAVE_SCATTER',`1')
|
||||
|
||||
include(`util.m4')
|
||||
|
||||
stdlib_core()
|
||||
scans()
|
||||
reduce_equal(WIDTH)
|
||||
rdrand_decls()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; broadcast/rotate/shuffle
|
||||
|
||||
declare <WIDTH x float> @__smear_float(float) nounwind readnone
|
||||
declare <WIDTH x double> @__smear_double(double) nounwind readnone
|
||||
declare <WIDTH x i8> @__smear_i8(i8) nounwind readnone
|
||||
declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
|
||||
declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
|
||||
declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
|
||||
|
||||
declare <WIDTH x float> @__setzero_float() nounwind readnone
|
||||
declare <WIDTH x double> @__setzero_double() nounwind readnone
|
||||
declare <WIDTH x i8> @__setzero_i8() nounwind readnone
|
||||
declare <WIDTH x i16> @__setzero_i16() nounwind readnone
|
||||
declare <WIDTH x i32> @__setzero_i32() nounwind readnone
|
||||
declare <WIDTH x i64> @__setzero_i64() nounwind readnone
|
||||
|
||||
declare <WIDTH x float> @__undef_float() nounwind readnone
|
||||
declare <WIDTH x double> @__undef_double() nounwind readnone
|
||||
declare <WIDTH x i8> @__undef_i8() nounwind readnone
|
||||
declare <WIDTH x i16> @__undef_i16() nounwind readnone
|
||||
declare <WIDTH x i32> @__undef_i32() nounwind readnone
|
||||
declare <WIDTH x i64> @__undef_i64() nounwind readnone
|
||||
|
||||
declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
|
||||
declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
|
||||
declare <WIDTH x i8> @__broadcast_i8(<WIDTH x i8>, i32) nounwind readnone
|
||||
declare <WIDTH x i16> @__broadcast_i16(<WIDTH x i16>, i32) nounwind readnone
|
||||
declare <WIDTH x i32> @__broadcast_i32(<WIDTH x i32>, i32) nounwind readnone
|
||||
declare <WIDTH x i64> @__broadcast_i64(<WIDTH x i64>, i32) nounwind readnone
|
||||
|
||||
declare <WIDTH x i8> @__rotate_i8(<WIDTH x i8>, i32) nounwind readnone
|
||||
declare <WIDTH x i16> @__rotate_i16(<WIDTH x i16>, i32) nounwind readnone
|
||||
declare <WIDTH x float> @__rotate_float(<WIDTH x float>, i32) nounwind readnone
|
||||
declare <WIDTH x i32> @__rotate_i32(<WIDTH x i32>, i32) nounwind readnone
|
||||
declare <WIDTH x double> @__rotate_double(<WIDTH x double>, i32) nounwind readnone
|
||||
declare <WIDTH x i64> @__rotate_i64(<WIDTH x i64>, i32) nounwind readnone
|
||||
|
||||
declare <WIDTH x i8> @__shuffle_i8(<WIDTH x i8>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i8> @__shuffle2_i8(<WIDTH x i8>, <WIDTH x i8>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i16> @__shuffle_i16(<WIDTH x i16>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i16> @__shuffle2_i16(<WIDTH x i16>, <WIDTH x i16>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x float> @__shuffle_float(<WIDTH x float>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x float> @__shuffle2_float(<WIDTH x float>, <WIDTH x float>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i32> @__shuffle_i32(<WIDTH x i32>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i32> @__shuffle2_i32(<WIDTH x i32>, <WIDTH x i32>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x double> @__shuffle_double(<WIDTH x double>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x double> @__shuffle2_double(<WIDTH x double>,
|
||||
<WIDTH x double>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i64> @__shuffle_i64(<WIDTH x i64>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i64> @__shuffle2_i64(<WIDTH x i64>, <WIDTH x i64>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; aos/soa
|
||||
|
||||
declare void @__soa_to_aos3_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
|
||||
<WIDTH x float> %v2, float * noalias %p) nounwind
|
||||
declare void @__aos_to_soa3_float(float * noalias %p, <WIDTH x float> * %out0,
|
||||
<WIDTH x float> * %out1, <WIDTH x float> * %out2) nounwind
|
||||
declare void @__soa_to_aos4_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
|
||||
<WIDTH x float> %v2, <WIDTH x float> %v3,
|
||||
float * noalias %p) nounwind
|
||||
declare void @__aos_to_soa4_float(float * noalias %p, <WIDTH x float> * noalias %out0,
|
||||
<WIDTH x float> * noalias %out1,
|
||||
<WIDTH x float> * noalias %out2,
|
||||
<WIDTH x float> * noalias %out3) nounwind
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; math
|
||||
|
||||
declare void @__fastmath() nounwind
|
||||
|
||||
;; round/floor/ceil
|
||||
|
||||
declare float @__round_uniform_float(float) nounwind readnone
|
||||
declare float @__floor_uniform_float(float) nounwind readnone
|
||||
declare float @__ceil_uniform_float(float) nounwind readnone
|
||||
|
||||
declare double @__round_uniform_double(double) nounwind readnone
|
||||
declare double @__floor_uniform_double(double) nounwind readnone
|
||||
declare double @__ceil_uniform_double(double) nounwind readnone
|
||||
|
||||
declare <WIDTH x float> @__round_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__floor_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__ceil_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
|
||||
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
|
||||
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
|
||||
|
||||
;; min/max
|
||||
|
||||
declare float @__max_uniform_float(float, float) nounwind readnone
|
||||
declare float @__min_uniform_float(float, float) nounwind readnone
|
||||
declare i32 @__min_uniform_int32(i32, i32) nounwind readnone
|
||||
declare i32 @__max_uniform_int32(i32, i32) nounwind readnone
|
||||
declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone
|
||||
declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone
|
||||
declare i64 @__min_uniform_int64(i64, i64) nounwind readnone
|
||||
declare i64 @__max_uniform_int64(i64, i64) nounwind readnone
|
||||
declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone
|
||||
declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone
|
||||
declare double @__min_uniform_double(double, double) nounwind readnone
|
||||
declare double @__max_uniform_double(double, double) nounwind readnone
|
||||
|
||||
declare <WIDTH x float> @__max_varying_float(<WIDTH x float>,
|
||||
<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__min_varying_float(<WIDTH x float>,
|
||||
<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
|
||||
declare <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
|
||||
declare <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
|
||||
declare <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
|
||||
declare <WIDTH x double> @__min_varying_double(<WIDTH x double>,
|
||||
<WIDTH x double>) nounwind readnone
|
||||
declare <WIDTH x double> @__max_varying_double(<WIDTH x double>,
|
||||
<WIDTH x double>) nounwind readnone
|
||||
|
||||
;; sqrt/rsqrt/rcp
|
||||
|
||||
declare float @__rsqrt_uniform_float(float) nounwind readnone
|
||||
declare float @__rcp_uniform_float(float) nounwind readnone
|
||||
declare float @__sqrt_uniform_float(float) nounwind readnone
|
||||
declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
|
||||
|
||||
declare double @__sqrt_uniform_double(double) nounwind readnone
|
||||
declare <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone
|
||||
|
||||
;; bit ops
|
||||
|
||||
declare i32 @__popcnt_int32(i32) nounwind readnone
|
||||
declare i64 @__popcnt_int64(i64) nounwind readnone
|
||||
|
||||
declare i32 @__count_trailing_zeros_i32(i32) nounwind readnone
|
||||
declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
|
||||
declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
|
||||
declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
|
||||
|
||||
;; svml
|
||||
|
||||
; FIXME: need either to wire these up to SVML entrypoints of the matching
|
||||
; vector width, or use the macro to call narrower ones several times on
|
||||
; slices of our vectors...
|
||||
|
||||
declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
|
||||
declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
|
||||
declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
|
||||
declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
|
||||
declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
|
||||
declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
|
||||
declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
|
||||
declare <WIDTH x float> @__svml_log(<WIDTH x float>)
|
||||
declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reductions
|
||||
|
||||
declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone
|
||||
declare i1 @__any(<WIDTH x i1>) nounwind readnone
|
||||
declare i1 @__all(<WIDTH x i1>) nounwind readnone
|
||||
declare i1 @__none(<WIDTH x i1>) nounwind readnone
|
||||
|
||||
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
|
||||
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
|
||||
declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone
|
||||
|
||||
declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
|
||||
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
|
||||
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
|
||||
|
||||
declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
|
||||
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
|
||||
|
||||
declare double @__reduce_add_double(<WIDTH x double>) nounwind readnone
|
||||
declare double @__reduce_min_double(<WIDTH x double>) nounwind readnone
|
||||
declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone
|
||||
|
||||
declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
|
||||
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
|
||||
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
|
||||
|
||||
declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
|
||||
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked loads/stores
|
||||
|
||||
|
||||
declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x float> @__masked_load_float(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x double> @__masked_load_double(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
|
||||
declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_float(<WIDTH x float>* nocapture, <WIDTH x float>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
declare void @__masked_store_double(<WIDTH x double>* nocapture, <WIDTH x double>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
declare void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
declare void @__masked_store_blend_double(<WIDTH x double>* nocapture, <WIDTH x double>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
', `
|
||||
; Blend-style masked store for i8 lanes. Loads the current vector from %0,
; picks the new value %1 in lanes where the mask %2 is true and the loaded
; value elsewhere, then stores the full vector back to %0. Every lane in
; memory is read and rewritten, including lanes whose mask bit is off.
define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x i8> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
|
||||
store <WIDTH x i8> %v1, <WIDTH x i8> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Blend-style masked store for i16 lanes. Loads the current vector from %0,
; picks the new value %1 in lanes where the mask %2 is true and the loaded
; value elsewhere, then stores the full vector back to %0. Every lane in
; memory is read and rewritten, including lanes whose mask bit is off.
define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x i16> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
|
||||
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Blend-style masked store for i32 lanes. Loads the current vector from %0,
; picks the new value %1 in lanes where the mask %2 is true and the loaded
; value elsewhere, then stores the full vector back to %0. Every lane in
; memory is read and rewritten, including lanes whose mask bit is off.
define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x i32> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
|
||||
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Blend-style masked store for float lanes. Loads the current vector from
; %0, picks the new value %1 in lanes where the mask %2 is true and the
; loaded value elsewhere, then stores the full vector back to %0. Every lane
; in memory is read and rewritten, including lanes whose mask bit is off.
define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x float> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x float> %1, <WIDTH x float> %v
|
||||
store <WIDTH x float> %v1, <WIDTH x float> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Blend-style masked store for i64 lanes. Loads the current vector from %0,
; picks the new value %1 in lanes where the mask %2 is true and the loaded
; value elsewhere, then stores the full vector back to %0. Every lane in
; memory is read and rewritten, including lanes whose mask bit is off.
define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
|
||||
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x i64> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
|
||||
store <WIDTH x i64> %v1, <WIDTH x i64> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
; Blend-style masked store for double lanes. Loads the current vector from
; %0, picks the new value %1 in lanes where the mask %2 is true and the
; loaded value elsewhere, then stores the full vector back to %0. Every lane
; in memory is read and rewritten, including lanes whose mask bit is off.
define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
|
||||
<WIDTH x double>, <WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x double> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x double> %1, <WIDTH x double> %v
|
||||
store <WIDTH x double> %v1, <WIDTH x double> * %0
|
||||
ret void
|
||||
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
;; m4 helper taking one argument, an element type name. Each expansion emits
;; declarations only (no bodies) for that type: gathers and scatters in a
;; base-pointer-plus-offsets form with 32-bit and 64-bit offset vectors, and
;; in a form taking a per-lane i32 or i64 vector directly (presumably the
;; lane addresses; confirm against the callers). All of them take an i1
;; per-lane mask as the last parameter.
define(`gather_scatter', `
|
||||
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, i32, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind readonly
|
||||
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, i32, <WIDTH x i64>,
|
||||
<WIDTH x i1>) nounwind readonly
|
||||
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind readonly
|
||||
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
|
||||
<WIDTH x i1>) nounwind readonly
|
||||
|
||||
declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, <WIDTH x i32>,
|
||||
<WIDTH x $1>, <WIDTH x i1>) nounwind
|
||||
declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, <WIDTH x i64>,
|
||||
<WIDTH x $1>, <WIDTH x i1>) nounwind
|
||||
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
|
||||
<WIDTH x i1>) nounwind
|
||||
')
|
||||
|
||||
gather_scatter(i8)
|
||||
gather_scatter(i16)
|
||||
gather_scatter(i32)
|
||||
gather_scatter(float)
|
||||
gather_scatter(i64)
|
||||
gather_scatter(double)
|
||||
|
||||
declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare i32 @__packed_store_active(i32 * nocapture, <WIDTH x i32> %vals,
|
||||
<WIDTH x i1>) nounwind
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; prefetch
|
||||
|
||||
declare void @__prefetch_read_uniform_1(i8 * nocapture) nounwind
|
||||
declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
|
||||
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
|
||||
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
|
||||
|
||||
@@ -29,6 +29,12 @@
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
ctlztz()
|
||||
define_prefetches()
|
||||
define_shuffles()
|
||||
aossoa()
|
||||
rdrand_decls()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -36,12 +36,24 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; standard 8-wide definitions from m4 macros
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
define(`WIDTH',`8')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
include(`builtins-sse2-common.ll')
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
|
||||
include(`target-sse2-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
@@ -283,7 +295,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
@@ -297,11 +309,66 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
;; Returns true when at least one of the 8 mask lanes has its sign (top)
;; bit set.  The movmsk.ps intrinsic takes 4 floats at a time, so the mask
;; is handled as two 4-lane halves whose results are merged.
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; do two 4-wide movmsk calls: one on lanes 0-3 and one on lanes 4-7
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; shift the result for the second (upper) half left by 4 and OR it with
|
||||
; the result for the first (lower) half, giving one bit per lane
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
; any nonzero bit means some lane is on
|
||||
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;; Returns true when every one of the 8 mask lanes has its sign (top) bit
;; set.  The movmsk.ps intrinsic takes 4 floats at a time, so the mask is
;; handled as two 4-lane halves whose results are merged.
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; do two 4-wide movmsk calls: one on lanes 0-3 and one on lanes 4-7
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; shift the result for the second (upper) half left by 4 and OR it with
|
||||
; the result for the first (lower) half, giving one bit per lane
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
; 255 is 0xFF: all eight lane bits are set
|
||||
%cmp = icmp eq i32 %v, 255
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;; Returns true when none of the 8 mask lanes has its sign (top) bit set.
;; The movmsk.ps intrinsic takes 4 floats at a time, so the mask is
;; handled as two 4-lane halves whose results are merged.
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; do two 4-wide movmsk calls: one on lanes 0-3 and one on lanes 4-7
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; shift the result for the second (upper) half left by 4 and OR it with
|
||||
; the result for the first (lower) half, giving one bit per lane
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
; zero means no lane bit is set
|
||||
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define <4 x float> @__vec4_add_float(<4 x float> %v0,
|
||||
<4 x float> %v1) nounwind readnone alwaysinline {
|
||||
<4 x float> %v1) nounwind readnone alwaysinline {
|
||||
%v = fadd <4 x float> %v0, %v1
|
||||
ret <4 x float> %v
|
||||
}
|
||||
@@ -325,7 +392,7 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
|
||||
; helper function for reduce_add_int32
|
||||
define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
|
||||
<4 x i32> %v1) nounwind readnone alwaysinline {
|
||||
<4 x i32> %v1) nounwind readnone alwaysinline {
|
||||
%v = add <4 x i32> %v0, %v1
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
@@ -348,11 +415,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
@@ -385,7 +447,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone {
|
||||
}
|
||||
|
||||
define <4 x i64> @__add_varying_int64(<4 x i64>,
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
%r = add <4 x i64> %0, %1
|
||||
ret <4 x i64> %r
|
||||
}
|
||||
@@ -420,28 +482,30 @@ reduce_equal(8)
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(8, i8, 8)
|
||||
load_and_broadcast(8, i16, 16)
|
||||
load_and_broadcast(8, i32, 32)
|
||||
load_and_broadcast(8, i64, 64)
|
||||
|
||||
load_masked(8, i8, 8, 1)
|
||||
load_masked(8, i16, 16, 2)
|
||||
load_masked(8, i32, 32, 4)
|
||||
load_masked(8, i64, 64, 8)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
gen_scatter(8, i8)
|
||||
gen_scatter(8, i16)
|
||||
gen_scatter(8, i32)
|
||||
gen_scatter(8, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float rounding
|
||||
@@ -545,23 +609,23 @@ define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alway
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(8, i8, 8)
|
||||
gen_masked_store(8, i16, 16)
|
||||
gen_masked_store(8, i32, 32)
|
||||
gen_masked_store(8, i64, 64)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <8 x i32> * %0, align 4
|
||||
%newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask)
|
||||
store <8 x i32> %newval, <8 x i32> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
%oldValue = load <8 x i64>* %ptr, align 8
|
||||
|
||||
; Do 8x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
|
||||
@@ -604,6 +668,8 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
ret void
|
||||
}
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -33,12 +33,24 @@
|
||||
;; Define the standard library builtins for the SSE2 target
|
||||
|
||||
; Define some basics for a 4-wide target
|
||||
stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
scans(4)
|
||||
int64minmax(4)
|
||||
define(`WIDTH',`4')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
include(`builtins-sse2-common.ll')
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
|
||||
include(`target-sse2-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
@@ -144,7 +156,7 @@ define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alway
|
||||
; from %1, and otherwise return the value from %0.
|
||||
|
||||
define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
|
||||
<4 x i32> %mask) nounwind readnone alwaysinline {
|
||||
<4 x i32> %mask) nounwind readnone alwaysinline {
|
||||
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
|
||||
%cleared_old = and <4 x i32> %0, %notmask
|
||||
%masked_new = and <4 x i32> %1, %mask
|
||||
@@ -153,7 +165,7 @@ define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
|
||||
}
|
||||
|
||||
define <4 x float> @__vselect_float(<4 x float>, <4 x float>,
|
||||
<4 x i32> %mask) nounwind readnone alwaysinline {
|
||||
<4 x i32> %mask) nounwind readnone alwaysinline {
|
||||
%v0 = bitcast <4 x float> %0 to <4 x i32>
|
||||
%v1 = bitcast <4 x float> %1 to <4 x i32>
|
||||
%r = call <4 x i32> @__vselect_i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %mask)
|
||||
@@ -227,10 +239,32 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
;; Returns true when at least one of the 4 mask lanes has its sign (top)
;; bit set: the movmsk.ps result is nonzero.
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;; Returns true when every one of the 4 mask lanes has its sign (top) bit
;; set: the movmsk.ps result equals 15 (0xF, one bit per lane).
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp eq i32 %v, 15
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;; Returns true when none of the 4 mask lanes has its sign (top) bit set:
;; the movmsk.ps result is zero.
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||
@@ -269,18 +303,13 @@ define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
|
||||
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
define double @__reduce_add_double(<4 x double>) nounwind readnone {
|
||||
@@ -337,16 +366,16 @@ reduce_equal(4)
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <4 x i32> * %0, align 4
|
||||
%newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
|
||||
store <4 x i32> %newval, <4 x i32> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
%oldValue = load <4 x i64>* %ptr, align 8
|
||||
|
||||
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
|
||||
@@ -388,6 +417,8 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
}
|
||||
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
@@ -539,35 +570,37 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
|
||||
|
||||
masked_store_blend_8_16_by_4()
|
||||
|
||||
gen_masked_store(4, i8, 8)
|
||||
gen_masked_store(4, i16, 16)
|
||||
gen_masked_store(4, i32, 32)
|
||||
gen_masked_store(4, i64, 64)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(4, i8, 8)
|
||||
load_and_broadcast(4, i16, 16)
|
||||
load_and_broadcast(4, i32, 32)
|
||||
load_and_broadcast(4, i64, 64)
|
||||
|
||||
load_masked(4, i8, 8, 1)
|
||||
load_masked(4, i16, 16, 2)
|
||||
load_masked(4, i32, 32, 4)
|
||||
load_masked(4, i64, 64, 8)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather(4, i8)
|
||||
gen_gather(4, i16)
|
||||
gen_gather(4, i32)
|
||||
gen_gather(4, i64)
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
gen_scatter(4, i8)
|
||||
gen_scatter(4, i16)
|
||||
gen_scatter(4, i32)
|
||||
gen_scatter(4, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
@@ -29,6 +29,12 @@
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
ctlztz()
|
||||
define_prefetches()
|
||||
define_shuffles()
|
||||
aossoa()
|
||||
rdrand_decls()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -36,12 +36,24 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; standard 8-wide definitions from m4 macros
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
define(`WIDTH',`8')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
include(`builtins-sse4-common.ll')
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
|
||||
include(`target-sse4-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
@@ -209,13 +221,13 @@ define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly al
|
||||
; unsigned int min/max
|
||||
|
||||
define <8 x i32> @__min_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
@@ -225,7 +237,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>,
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
@@ -239,7 +251,62 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
;; Returns true when at least one of the 8 mask lanes has its sign (top)
;; bit set.  The movmsk.ps intrinsic takes 4 floats at a time, so the mask
;; is handled as two 4-lane halves whose results are merged.
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; do two 4-wide movmsk calls: one on lanes 0-3 and one on lanes 4-7
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; shift the result for the second (upper) half left by 4 and OR it with
|
||||
; the result for the first (lower) half, giving one bit per lane
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
; any nonzero bit means some lane is on
|
||||
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;; Returns true when every one of the 8 mask lanes has its sign (top) bit
;; set.  The movmsk.ps intrinsic takes 4 floats at a time, so the mask is
;; handled as two 4-lane halves whose results are merged.
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; do two 4-wide movmsk calls: one on lanes 0-3 and one on lanes 4-7
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; shift the result for the second (upper) half left by 4 and OR it with
|
||||
; the result for the first (lower) half, giving one bit per lane
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
; 255 is 0xFF: all eight lane bits are set
|
||||
%cmp = icmp eq i32 %v, 255
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;; Returns true when none of the 8 mask lanes has its sign (top) bit set.
;; The movmsk.ps intrinsic takes 4 floats at a time, so the mask is
;; handled as two 4-lane halves whose results are merged.
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; do two 4-wide movmsk calls: one on lanes 0-3 and one on lanes 4-7
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; shift the result for the second (upper) half left by 4 and OR it with
|
||||
; the result for the first (lower) half, giving one bit per lane
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
; zero means no lane bit is set
|
||||
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
@@ -252,7 +319,7 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
|
||||
; helper function for reduce_add_int32
|
||||
define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
|
||||
<4 x i32> %v1) nounwind readnone alwaysinline {
|
||||
<4 x i32> %v1) nounwind readnone alwaysinline {
|
||||
%v = add <4 x i32> %v0, %v1
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
@@ -275,11 +342,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
|
||||
}
|
||||
@@ -312,7 +374,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone {
|
||||
}
|
||||
|
||||
define <4 x i64> @__add_varying_int64(<4 x i64>,
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
%r = add <4 x i64> %0, %1
|
||||
ret <4 x i64> %r
|
||||
}
|
||||
@@ -347,28 +409,30 @@ reduce_equal(8)
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(8, i8, 8)
|
||||
load_and_broadcast(8, i16, 16)
|
||||
load_and_broadcast(8, i32, 32)
|
||||
load_and_broadcast(8, i64, 64)
|
||||
|
||||
load_masked(8, i8, 8, 1)
|
||||
load_masked(8, i16, 16, 2)
|
||||
load_masked(8, i32, 32, 4)
|
||||
load_masked(8, i64, 64, 8)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
gen_scatter(8, i8)
|
||||
gen_scatter(8, i16)
|
||||
gen_scatter(8, i32)
|
||||
gen_scatter(8, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float rounding
|
||||
@@ -431,18 +495,18 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(8, i8, 8)
|
||||
gen_masked_store(8, i16, 16)
|
||||
gen_masked_store(8, i32, 32)
|
||||
gen_masked_store(8, i64, 64)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
|
||||
<4 x float>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
; do two 4-wide blends with blendvps
|
||||
%mask_as_float = bitcast <8 x i32> %mask to <8 x float>
|
||||
%mask_a = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
|
||||
@@ -471,8 +535,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
; implement this as 4 blends of <4 x i32>s, which are actually bitcast
|
||||
; <2 x i64>s...
|
||||
|
||||
@@ -538,6 +602,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
ret void
|
||||
}
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -33,12 +33,24 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; Define common 4-wide stuff
|
||||
stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
scans(4)
|
||||
int64minmax(4)
|
||||
define(`WIDTH',`4')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
include(`builtins-sse4-common.ll')
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
|
||||
include(`target-sse4-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
@@ -259,10 +271,32 @@ define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alway
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
;; Returns true when at least one of the 4 mask lanes has its sign (top)
;; bit set: the movmsk.ps result is nonzero.
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;; Returns true when every one of the 4 mask lanes has its sign (top) bit
;; set: the movmsk.ps result equals 15 (0xF, one bit per lane).
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp eq i32 %v, 15
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;; Returns true when none of the 4 mask lanes has its sign (top) bit set:
;; the movmsk.ps result is zero.
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
@@ -300,18 +334,13 @@ define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
|
||||
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
define double @__reduce_add_double(<4 x double>) nounwind readnone {
|
||||
@@ -371,8 +400,8 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
|
||||
<4 x float>) nounwind readnone
|
||||
|
||||
|
||||
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
|
||||
%oldValue = load <4 x i32>* %0, align 4
|
||||
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
|
||||
@@ -386,8 +415,8 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
|
||||
}
|
||||
|
||||
|
||||
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
<4 x i32> %i32mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
<4 x i32> %i32mask) nounwind alwaysinline {
|
||||
%oldValue = load <4 x i64>* %ptr, align 8
|
||||
%mask = bitcast <4 x i32> %i32mask to <4 x float>
|
||||
|
||||
@@ -438,35 +467,39 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
|
||||
masked_store_blend_8_16_by_4()
|
||||
|
||||
gen_masked_store(4, i8, 8)
|
||||
gen_masked_store(4, i16, 16)
|
||||
gen_masked_store(4, i32, 32)
|
||||
gen_masked_store(4, i64, 64)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(4, i8, 8)
|
||||
load_and_broadcast(4, i16, 16)
|
||||
load_and_broadcast(4, i32, 32)
|
||||
load_and_broadcast(4, i64, 64)
|
||||
|
||||
load_masked(4, i8, 8, 1)
|
||||
load_masked(4, i16, 16, 2)
|
||||
load_masked(4, i32, 32, 4)
|
||||
load_masked(4, i64, 64, 8)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather(4, i8)
|
||||
gen_gather(4, i16)
|
||||
gen_gather(4, i32)
|
||||
gen_gather(4, i64)
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
gen_scatter(4, i8)
|
||||
gen_scatter(4, i16)
|
||||
gen_scatter(4, i32)
|
||||
gen_scatter(4, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
4046
builtins/util.m4
Normal file
4046
builtins/util.m4
Normal file
File diff suppressed because it is too large
Load Diff
4942
cbackend.cpp
Normal file
4942
cbackend.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@@ -17,7 +17,7 @@ syn keyword ispcStatement cbreak ccontinue creturn launch print reference soa sy
|
||||
syn keyword ispcConditional cif
|
||||
syn keyword ispcRepeat cdo cfor cwhile
|
||||
syn keyword ispcBuiltin programCount programIndex
|
||||
syn keyword ispcType export int8 int16 int32 int64
|
||||
syn keyword ispcType export uniform varying int8 int16 int32 int64
|
||||
|
||||
" Default highlighting
|
||||
command -nargs=+ HiLink hi def link <args>
|
||||
|
||||
8
contrib/ispc.vim.README
Normal file
8
contrib/ispc.vim.README
Normal file
@@ -0,0 +1,8 @@
|
||||
To install vim syntax highlighting for ispc files:
|
||||
|
||||
1) Copy ispc.vim into ~/.vim/syntax/ispc.vim (create if necessary)
|
||||
2) Create a filetype for ispc files to correspond to that syntax file
|
||||
To do this, create and append the following line to ~/.vim/ftdetect/ispc.vim
|
||||
|
||||
au BufRead,BufNewFile *.ispc set filetype=ispc
|
||||
|
||||
293
ctx.h
293
ctx.h
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2013, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,21 +28,32 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file ctx.h
|
||||
@brief Declaration of the FunctionEmitContext class
|
||||
@brief %Declaration of the FunctionEmitContext class
|
||||
*/
|
||||
|
||||
#ifndef ISPC_CTX_H
|
||||
#define ISPC_CTX_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include <llvm/InstrTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Analysis/DIBuilder.h>
|
||||
#include <llvm/Analysis/DebugInfo.h>
|
||||
#include <map>
|
||||
#if defined(LLVM_3_1) || defined(LLVM_3_2)
|
||||
#include <llvm/InstrTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#else
|
||||
#include <llvm/IR/InstrTypes.h>
|
||||
#include <llvm/IR/Instructions.h>
|
||||
#endif
|
||||
#if defined(LLVM_3_1)
|
||||
#include <llvm/Analysis/DebugInfo.h>
|
||||
#include <llvm/Analysis/DIBuilder.h>
|
||||
#else
|
||||
#include <llvm/DebugInfo.h>
|
||||
#include <llvm/DIBuilder.h>
|
||||
#endif
|
||||
|
||||
struct CFInfo;
|
||||
|
||||
@@ -64,7 +75,7 @@ public:
|
||||
@param firstStmtPos Source file position of the first statement in the
|
||||
function
|
||||
*/
|
||||
FunctionEmitContext(Function *function, Symbol *funSym,
|
||||
FunctionEmitContext(Function *function, Symbol *funSym,
|
||||
llvm::Function *llvmFunction,
|
||||
SourcePos firstStmtPos);
|
||||
~FunctionEmitContext();
|
||||
@@ -76,9 +87,9 @@ public:
|
||||
/** @name Current basic block management
|
||||
@{
|
||||
*/
|
||||
/** Returns the current basic block pointer */
|
||||
/** Returns the current basic block pointer */
|
||||
llvm::BasicBlock *GetCurrentBasicBlock();
|
||||
|
||||
|
||||
/** Set the given llvm::BasicBlock to be the basic block to emit
|
||||
forthcoming instructions into. */
|
||||
void SetCurrentBasicBlock(llvm::BasicBlock *bblock);
|
||||
@@ -86,7 +97,7 @@ public:
|
||||
/** @name Mask management
|
||||
@{
|
||||
*/
|
||||
/** Returns the mask value at entry to the current function. */
|
||||
/** Returns the mask value at entry to the current function. */
|
||||
llvm::Value *GetFunctionMask();
|
||||
|
||||
/** Returns the mask value corresponding to "varying" control flow
|
||||
@@ -95,12 +106,12 @@ public:
|
||||
llvm::Value *GetInternalMask();
|
||||
|
||||
/** Returns the complete current mask value--i.e. the logical AND of
|
||||
the function entry mask and the internal mask. */
|
||||
the function entry mask and the internal mask. */
|
||||
llvm::Value *GetFullMask();
|
||||
|
||||
/** Provides the alloca'd pointer to memory to store the full function
|
||||
mask. This is only used to wire up the __mask builtin variable. */
|
||||
void SetMaskPointer(llvm::Value *p);
|
||||
/** Returns a pointer to storage in memory that stores the current full
|
||||
mask. */
|
||||
llvm::Value *GetFullMaskPointer();
|
||||
|
||||
/** Provides the value of the mask at function entry */
|
||||
void SetFunctionMask(llvm::Value *val);
|
||||
@@ -148,22 +159,21 @@ public:
|
||||
'continue' statements should jump to (if all running lanes want to
|
||||
break or continue), uniformControlFlow indicates whether the loop
|
||||
condition is 'uniform'. */
|
||||
void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget,
|
||||
void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget,
|
||||
bool uniformControlFlow);
|
||||
|
||||
/** Informs FunctionEmitContext of the value of the mask at the start
|
||||
of a loop body. */
|
||||
void SetLoopMask(llvm::Value *mask);
|
||||
of a loop body or switch statement. */
|
||||
void SetBlockEntryMask(llvm::Value *mask);
|
||||
|
||||
/** Informs FunctionEmitContext that code generation for a loop is
|
||||
finished. */
|
||||
void EndLoop();
|
||||
|
||||
/** Indicates that code generation for a 'foreach' or 'foreach_tiled'
|
||||
loop is about to start. The provided basic block pointer indicates
|
||||
where control flow should go if a 'continue' statement is executed
|
||||
in the loop. */
|
||||
void StartForeach(llvm::BasicBlock *continueTarget);
|
||||
/** Indicates that code generation for a 'foreach', 'foreach_tiled',
|
||||
'foreach_active', or 'foreach_unique' loop is about to start. */
|
||||
enum ForeachType { FOREACH_REGULAR, FOREACH_ACTIVE, FOREACH_UNIQUE };
|
||||
void StartForeach(ForeachType ft);
|
||||
void EndForeach();
|
||||
|
||||
/** Emit code for a 'break' statement in a loop. If doCoherenceCheck
|
||||
@@ -186,12 +196,73 @@ public:
|
||||
previous iteration. */
|
||||
void RestoreContinuedLanes();
|
||||
|
||||
/** Indicates that code generation for a "switch" statement is about to
|
||||
start. isUniform indicates whether the "switch" value is uniform,
|
||||
and bbAfterSwitch gives the basic block immediately following the
|
||||
"switch" statement. (For example, if the switch condition is
|
||||
uniform, we jump here upon executing a "break" statement.) */
|
||||
void StartSwitch(bool isUniform, llvm::BasicBlock *bbAfterSwitch);
|
||||
/** Indicates the end of code generation for a "switch" statement. */
|
||||
void EndSwitch();
|
||||
|
||||
/** Emits code for a "switch" statement in the program.
|
||||
@param expr Gives the value of the expression after the "switch"
|
||||
@param defaultBlock Basic block to execute for the "default" case. This
|
||||
should be NULL if there is no "default" label inside
|
||||
the switch.
|
||||
@param caseBlocks vector that stores the mapping from label values
|
||||
after "case" statements to basic blocks corresponding
|
||||
to the "case" labels.
|
||||
@param nextBlocks For each basic block for a "case" or "default"
|
||||
label, this gives the basic block for the
|
||||
immediately-following "case" or "default" label (or
|
||||
the basic block after the "switch" statement for the
|
||||
last label.)
|
||||
*/
|
||||
void SwitchInst(llvm::Value *expr, llvm::BasicBlock *defaultBlock,
|
||||
const std::vector<std::pair<int, llvm::BasicBlock *> > &caseBlocks,
|
||||
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &nextBlocks);
|
||||
|
||||
/** Generates code for a "default" label after a "switch" statement.
|
||||
The checkMask parameter indicates whether additional code should be
|
||||
generated to check to see if the execution mask is all off after
|
||||
the default label (in which case a jump to the following label will
|
||||
be issued). */
|
||||
void EmitDefaultLabel(bool checkMask, SourcePos pos);
|
||||
|
||||
/** Generates code for a "case" label after a "switch" statement. See
|
||||
the documentation for EmitDefaultLabel() for discussion of the
|
||||
checkMask parameter. */
|
||||
void EmitCaseLabel(int value, bool checkMask, SourcePos pos);
|
||||
|
||||
/** Returns the current number of nested levels of 'varying' control
|
||||
flow */
|
||||
int VaryingCFDepth() const;
|
||||
|
||||
bool InForeachLoop() const;
|
||||
|
||||
/** Temporarily disables emission of performance warnings from gathers
|
||||
and scatters from subsequent code. */
|
||||
void DisableGatherScatterWarnings();
|
||||
|
||||
/** Reenables emission of gather/scatter performance warnings. */
|
||||
void EnableGatherScatterWarnings();
|
||||
|
||||
void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }
|
||||
|
||||
/** Step through the code and find label statements; create a basic
|
||||
block for each one, so that subsequent calls to
|
||||
GetLabeledBasicBlock() return the corresponding basic block. */
|
||||
void InitializeLabelMap(Stmt *code);
|
||||
|
||||
/** If there is a label in the function with the given name, return the
|
||||
new basic block that it starts. */
|
||||
llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
|
||||
|
||||
/** Returns a vector of all labels in the context. This is
|
||||
simply the key set of the labelMap */
|
||||
std::vector<std::string> GetLabels();
|
||||
|
||||
/** Called to generate code for 'return' statement; value is the
|
||||
expression in the return statement (if non-NULL), and
|
||||
doCoherenceCheck indicates whether instructions should be generated
|
||||
@@ -201,7 +272,7 @@ public:
|
||||
/** @} */
|
||||
|
||||
/** @name Small helper/utility routines
|
||||
@{
|
||||
@{
|
||||
*/
|
||||
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
|
||||
i1 value that indicates if any of the mask lanes are on. */
|
||||
@@ -212,7 +283,11 @@ public:
|
||||
llvm::Value *All(llvm::Value *mask);
|
||||
|
||||
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
|
||||
i32 value wherein the i'th bit is on if and only if the i'th lane
|
||||
i1 value that indicates if all of the mask lanes are off. */
|
||||
llvm::Value *None(llvm::Value *mask);
|
||||
|
||||
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
|
||||
i64 value wherein the i'th bit is on if and only if the i'th lane
|
||||
of the mask is on. */
|
||||
llvm::Value *LaneMask(llvm::Value *mask);
|
||||
|
||||
@@ -220,6 +295,10 @@ public:
|
||||
that indicates whether the two masks are equal. */
|
||||
llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
|
||||
|
||||
/** Generate ConstantVector, which contains ProgramIndex, i.e.
|
||||
< i32 0, i32 1, i32 2, i32 3> */
|
||||
llvm::Value *ProgramIndexVector(bool is32bits = true);
|
||||
|
||||
/** Given a string, create an anonymous global variable to hold its
|
||||
value and return the pointer to the string. */
|
||||
llvm::Value *GetStringPtr(const std::string &str);
|
||||
@@ -257,7 +336,7 @@ public:
|
||||
llvm::Instruction for convenience; in calling code we often have
|
||||
Instructions stored using Value pointers; the code here returns
|
||||
silently if it's not actually given an instruction. */
|
||||
void AddDebugPos(llvm::Value *instruction, const SourcePos *pos = NULL,
|
||||
void AddDebugPos(llvm::Value *instruction, const SourcePos *pos = NULL,
|
||||
llvm::DIScope *scope = NULL);
|
||||
|
||||
/** Inform the debugging information generation code that a new scope
|
||||
@@ -278,7 +357,7 @@ public:
|
||||
|
||||
/** Emits debugging information for the function parameter represented
|
||||
by sym. */
|
||||
void EmitFunctionParameterDebugInfo(Symbol *sym);
|
||||
void EmitFunctionParameterDebugInfo(Symbol *sym, int parameterNum);
|
||||
/** @} */
|
||||
|
||||
/** @name IR instruction emission
|
||||
@@ -286,7 +365,7 @@ public:
|
||||
instructions. See the LLVM assembly language reference manual
|
||||
(http://llvm.org/docs/LangRef.html) and the LLVM doxygen documentation
|
||||
(http://llvm.org/doxygen) for more information. Here we will only
|
||||
document significant generalizations to the functionality of the
|
||||
document significant generalizations to the functionality of the
|
||||
corresponding basic LLVM instructions.
|
||||
|
||||
Beyond actually emitting the instruction, the implementations of
|
||||
@@ -302,7 +381,7 @@ public:
|
||||
this also handles applying the given operation to the vector
|
||||
elements. */
|
||||
llvm::Value *BinaryOperator(llvm::Instruction::BinaryOps inst,
|
||||
llvm::Value *v0, llvm::Value *v1,
|
||||
llvm::Value *v0, llvm::Value *v1,
|
||||
const char *name = NULL);
|
||||
|
||||
/** Emit the "not" operator. Like BinaryOperator(), this also handles
|
||||
@@ -312,7 +391,7 @@ public:
|
||||
/** Emit a comparison instruction. If the operands are VectorTypes,
|
||||
then a value for the corresponding boolean VectorType is
|
||||
returned. */
|
||||
llvm::Value *CmpInst(llvm::Instruction::OtherOps inst,
|
||||
llvm::Value *CmpInst(llvm::Instruction::OtherOps inst,
|
||||
llvm::CmpInst::Predicate pred,
|
||||
llvm::Value *v0, llvm::Value *v1, const char *name = NULL);
|
||||
|
||||
@@ -320,25 +399,35 @@ public:
|
||||
array, for pointer types). */
|
||||
llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
|
||||
|
||||
llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Value *BitCastInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Value *IntToPtrInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
|
||||
llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Instruction *TruncInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
|
||||
LLVM_TYPE_CONST llvm::Type *type, const char *name = NULL);
|
||||
llvm::Instruction *FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Type *type, const char *name = NULL);
|
||||
llvm::Instruction *FPCastInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Instruction *SExtInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Instruction *ZExtInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
|
||||
/** Given two integer-typed values (but possibly one vector and the
|
||||
other not, and/or of possibly-different bit-widths), update their
|
||||
values as needed so that the two have the same (more general)
|
||||
type. */
|
||||
void MatchIntegerTypes(llvm::Value **v0, llvm::Value **v1);
|
||||
|
||||
/** Create a new slice pointer out of the given pointer to an soa type
|
||||
and an integer offset to a slice within that type. */
|
||||
llvm::Value *MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset);
|
||||
|
||||
/** These GEP methods are generalizations of the standard ones in LLVM;
|
||||
they support both uniform and varying basePtr values as well as
|
||||
uniform and varying index values (arrays of indices). Varying base
|
||||
@@ -359,7 +448,8 @@ public:
|
||||
the type of the pointer, though it may be NULL if the base pointer
|
||||
is uniform. */
|
||||
llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
|
||||
const Type *ptrType, const char *name = NULL);
|
||||
const Type *ptrType, const char *name = NULL,
|
||||
const PointerType **resultPtrType = NULL);
|
||||
|
||||
/** Load from the memory location(s) given by lvalue, using the given
|
||||
mask. The lvalue may be varying, in which case this corresponds to
|
||||
@@ -376,9 +466,9 @@ public:
|
||||
allocated at the given alignment. By default, the alloca
|
||||
instruction is added at the start of the function in the entry
|
||||
basic block; if it should be added to the current basic block, then
|
||||
the atEntryBlock parameter should be false. */
|
||||
llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType,
|
||||
const char *name = NULL, int align = 0,
|
||||
the atEntryBlock parameter should be false. */
|
||||
llvm::Value *AllocaInst(llvm::Type *llvmType,
|
||||
const char *name = NULL, int align = 0,
|
||||
bool atEntryBlock = true);
|
||||
|
||||
/** Standard store instruction; for this variant, the lvalue must be a
|
||||
@@ -390,7 +480,14 @@ public:
|
||||
varying, the given storeMask is used to mask the stores so that
|
||||
they only execute for the active program instances. */
|
||||
void StoreInst(llvm::Value *value, llvm::Value *ptr,
|
||||
llvm::Value *storeMask, const Type *ptrType);
|
||||
llvm::Value *storeMask, const Type *valueType,
|
||||
const Type *ptrType);
|
||||
|
||||
/** Copy count bytes of memory from the location pointed to by src to
|
||||
the location pointed to by dest. (src and dest must not be
|
||||
overlapping.) */
|
||||
void MemcpyInst(llvm::Value *dest, llvm::Value *src, llvm::Value *count,
|
||||
llvm::Value *align = NULL);
|
||||
|
||||
void BranchInst(llvm::BasicBlock *block);
|
||||
void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
|
||||
@@ -404,10 +501,20 @@ public:
|
||||
/** This convenience method maps to an llvm::InsertElementInst if the
|
||||
given value is a llvm::VectorType, and to an llvm::InsertValueInst
|
||||
otherwise. */
|
||||
llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
|
||||
llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
|
||||
const char *name = NULL);
|
||||
|
||||
llvm::PHINode *PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
|
||||
/** This convenience method maps to an llvm::ShuffleVectorInst. */
|
||||
llvm::Value *ShuffleInst(llvm::Value *v1, llvm::Value *v2, llvm::Value *mask,
|
||||
const char *name = NULL);
|
||||
|
||||
/** This convenience method generates a broadcast pattern. It takes a value
|
||||
and a vector type. Type of the value must match element type of the
|
||||
vector. */
|
||||
llvm::Value *BroadcastValue(llvm::Value *v, llvm::Type *vecType,
|
||||
const char *name = NULL);
|
||||
|
||||
llvm::PHINode *PhiNode(llvm::Type *type, int count,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
|
||||
llvm::Value *val1, const char *name = NULL);
|
||||
@@ -433,7 +540,7 @@ public:
|
||||
|
||||
/** Launch an asynchronous task to run the given function, passing it
|
||||
the given argument values. */
|
||||
llvm::Value *LaunchInst(llvm::Value *callee,
|
||||
llvm::Value *LaunchInst(llvm::Value *callee,
|
||||
std::vector<llvm::Value *> &argVals,
|
||||
llvm::Value *launchCount);
|
||||
|
||||
@@ -446,6 +553,9 @@ private:
|
||||
/** Pointer to the Function for which we're currently generating code. */
|
||||
Function *function;
|
||||
|
||||
/** LLVM function representation for the current function. */
|
||||
llvm::Function *llvmFunction;
|
||||
|
||||
/** The basic block into which we add any alloca instructions that need
|
||||
to go at the very start of the function. */
|
||||
llvm::BasicBlock *allocaBlock;
|
||||
@@ -475,14 +585,14 @@ private:
|
||||
for error messages and debugging symbols. */
|
||||
SourcePos funcStartPos;
|
||||
|
||||
/** If currently in a loop body, the value of the mask at the start of
|
||||
the loop. */
|
||||
llvm::Value *loopMask;
|
||||
/** If currently in a loop body or switch statement, the value of the
|
||||
mask at the start of it. */
|
||||
llvm::Value *blockEntryMask;
|
||||
|
||||
/** If currently in a loop body, this is a pointer to memory to store a
|
||||
mask value that represents which of the lanes have executed a
|
||||
'break' statement. If we're not in a loop body, this should be
|
||||
NULL. */
|
||||
/** If currently in a loop body or switch statement, this is a pointer
|
||||
to memory to store a mask value that represents which of the lanes
|
||||
have executed a 'break' statement. If we're not in a loop body or
|
||||
switch, this should be NULL. */
|
||||
llvm::Value *breakLanesPtr;
|
||||
|
||||
/** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
|
||||
@@ -490,16 +600,49 @@ private:
|
||||
'continue' statement. */
|
||||
llvm::Value *continueLanesPtr;
|
||||
|
||||
/** If we're inside a loop, this gives the basic block immediately
|
||||
after the current loop, which we will jump to if all of the lanes
|
||||
have executed a break statement or are otherwise done with the
|
||||
loop. */
|
||||
/** If we're inside a loop or switch statement, this gives the basic
|
||||
block immediately after the current loop or switch, which we will
|
||||
jump to if all of the lanes have executed a break statement or are
|
||||
otherwise done with it. */
|
||||
llvm::BasicBlock *breakTarget;
|
||||
|
||||
/** If we're inside a loop, this gives the block to jump to if all of
|
||||
the running lanes have executed a 'continue' statement. */
|
||||
llvm::BasicBlock *continueTarget;
|
||||
|
||||
/** @name Switch statement state
|
||||
|
||||
These variables store various state that's active when we're
|
||||
generating code for a switch statement. They should all be NULL
|
||||
outside of a switch.
|
||||
@{
|
||||
*/
|
||||
|
||||
/** The value of the expression used to determine which case in the
|
||||
statements after the switch to execute. */
|
||||
llvm::Value *switchExpr;
|
||||
|
||||
/** Map from case label numbers to the basic block that will hold code
|
||||
for that case. */
|
||||
const std::vector<std::pair<int, llvm::BasicBlock *> > *caseBlocks;
|
||||
|
||||
/** The basic block of code to run for the "default" label in the
|
||||
switch statement. */
|
||||
llvm::BasicBlock *defaultBlock;
|
||||
|
||||
/** For each basic block for the code for cases (and the default label,
|
||||
if present), this map gives the basic block for the immediately
|
||||
following case/default label. */
|
||||
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *nextBlocks;
|
||||
|
||||
/** Records whether the switch condition was uniform; this is a
|
||||
distinct notion from whether the switch represents uniform or
|
||||
varying control flow; we may have varying control flow from a
|
||||
uniform switch condition if there is a 'break' inside the switch
|
||||
that's under varying control flow. */
|
||||
bool switchConditionWasUniform;
|
||||
/** @} */
|
||||
|
||||
/** A pointer to memory that records which of the program instances
|
||||
have executed a 'return' statement (and are thus really truly done
|
||||
running any more instructions in this function). */
|
||||
@@ -518,12 +661,12 @@ private:
|
||||
std::vector<CFInfo *> controlFlowInfo;
|
||||
|
||||
/** DIFile object corresponding to the source file where the current
|
||||
function was defined (used for debugging info0. */
|
||||
function was defined (used for debugging info). */
|
||||
llvm::DIFile diFile;
|
||||
|
||||
/** DISubprogram corresponding to this function (used for debugging
|
||||
info). */
|
||||
llvm::DISubprogram diFunction;
|
||||
llvm::DISubprogram diSubprogram;
|
||||
|
||||
/** These correspond to the current set of nested scopes in the
|
||||
function. */
|
||||
@@ -537,23 +680,43 @@ private:
|
||||
tasks launched from the current function. */
|
||||
llvm::Value *launchGroupHandlePtr;
|
||||
|
||||
/** Nesting count of the number of times calling code has disabled (and
|
||||
not yet reenabled) gather/scatter performance warnings. */
|
||||
int disableGSWarningCount;
|
||||
|
||||
std::map<std::string, llvm::BasicBlock *> labelMap;
|
||||
|
||||
static bool initLabelBBlocks(ASTNode *node, void *data);
|
||||
|
||||
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
|
||||
static void addGSMetadata(llvm::Value *inst, SourcePos pos);
|
||||
bool ifsInLoopAllUniform() const;
|
||||
bool ifsInCFAllUniform(int cfType) const;
|
||||
void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
|
||||
llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
|
||||
|
||||
llvm::Value *applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
|
||||
llvm::Value *applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
|
||||
const Type *ptrType);
|
||||
|
||||
void restoreMaskGivenReturns(llvm::Value *oldMask);
|
||||
void addSwitchMaskCheck(llvm::Value *mask);
|
||||
bool inSwitchStatement() const;
|
||||
llvm::Value *getMaskAtSwitchEntry();
|
||||
|
||||
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
|
||||
llvm::Value *mask);
|
||||
CFInfo *popCFState();
|
||||
|
||||
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *valueType,
|
||||
const Type *ptrType, llvm::Value *mask);
|
||||
void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
|
||||
llvm::Value *mask);
|
||||
llvm::Value *gather(llvm::Value *ptr, const Type *ptrType, llvm::Value *mask,
|
||||
const char *name);
|
||||
void storeUniformToSOA(llvm::Value *value, llvm::Value *ptr,
|
||||
llvm::Value *mask, const Type *valueType,
|
||||
const PointerType *ptrType);
|
||||
llvm::Value *loadUniformFromSOA(llvm::Value *ptr, llvm::Value *mask,
|
||||
const PointerType *ptrType, const char *name);
|
||||
|
||||
llvm::Value *gather(llvm::Value *ptr, const PointerType *ptrType,
|
||||
llvm::Value *mask, const char *name);
|
||||
|
||||
llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
|
||||
};
|
||||
|
||||
|
||||
677
decl.cpp
677
decl.cpp
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2013, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,12 +28,12 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file decl.cpp
|
||||
@brief Implementations of classes related to turning declarations into
|
||||
symbols and types.
|
||||
@brief Implementations of classes related to turning declarations into
|
||||
symbol names and types.
|
||||
*/
|
||||
|
||||
#include "decl.h"
|
||||
@@ -44,16 +44,50 @@
|
||||
#include "stmt.h"
|
||||
#include "expr.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <set>
|
||||
|
||||
static void
|
||||
lPrintTypeQualifiers(int typeQualifiers) {
|
||||
if (typeQualifiers & TYPEQUAL_INLINE) printf("inline ");
|
||||
if (typeQualifiers & TYPEQUAL_CONST) printf("const ");
|
||||
if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
|
||||
if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
|
||||
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
|
||||
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
|
||||
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
|
||||
if (typeQualifiers & TYPEQUAL_EXPORT) printf("export ");
|
||||
if (typeQualifiers & TYPEQUAL_UNMASKED) printf("unmasked ");
|
||||
}
|
||||
|
||||
|
||||
/** Given a Type and a set of type qualifiers, apply the type qualifiers to
|
||||
the type, returning the type that is the result.
|
||||
the type, returning the type that is the result.
|
||||
*/
|
||||
static const Type *
|
||||
lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
|
||||
if (type == NULL)
|
||||
return NULL;
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
|
||||
type = type->GetAsConstType();
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) {
|
||||
if (Type::Equal(type, AtomicType::Void))
|
||||
Error(pos, "\"uniform\" qualifier is illegal with \"void\" type.");
|
||||
else
|
||||
type = type->GetAsUniformType();
|
||||
}
|
||||
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0) {
|
||||
if (Type::Equal(type, AtomicType::Void))
|
||||
Error(pos, "\"varying\" qualifier is illegal with \"void\" type.");
|
||||
else
|
||||
type = type->GetAsVaryingType();
|
||||
}
|
||||
else
|
||||
if (Type::Equal(type, AtomicType::Void) == false)
|
||||
type = type->GetAsUnboundVariabilityType();
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
|
||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
|
||||
Error(pos, "Illegal to apply both \"signed\" and \"unsigned\" "
|
||||
@@ -62,30 +96,19 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
|
||||
const Type *unsignedType = type->GetAsUnsignedType();
|
||||
if (unsignedType != NULL)
|
||||
type = unsignedType;
|
||||
else
|
||||
else {
|
||||
const Type *resolvedType =
|
||||
type->ResolveUnboundVariability(Variability::Varying);
|
||||
Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
|
||||
type->GetString().c_str());
|
||||
|
||||
resolvedType->GetString().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false)
|
||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false) {
|
||||
const Type *resolvedType =
|
||||
type->ResolveUnboundVariability(Variability::Varying);
|
||||
Error(pos, "\"signed\" qualifier is illegal with non-integer type "
|
||||
"\"%s\".", type->GetString().c_str());
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
|
||||
type = type->GetAsConstType();
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
|
||||
type = type->GetAsUniformType();
|
||||
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
|
||||
type = type->GetAsVaryingType();
|
||||
else {
|
||||
// otherwise, structs are uniform by default and everything
|
||||
// else is varying by default
|
||||
if (dynamic_cast<const StructType *>(type->GetBaseType()) != NULL)
|
||||
type = type->GetAsUniformType();
|
||||
else
|
||||
type = type->GetAsVaryingType();
|
||||
"\"%s\".", resolvedType->GetString().c_str());
|
||||
}
|
||||
|
||||
return type;
|
||||
@@ -106,18 +129,59 @@ DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
|
||||
|
||||
const Type *
|
||||
DeclSpecs::GetBaseType(SourcePos pos) const {
|
||||
const Type *bt = baseType;
|
||||
const Type *retType = baseType;
|
||||
|
||||
if (retType == NULL) {
|
||||
Warning(pos, "No type specified in declaration. Assuming int32.");
|
||||
retType = AtomicType::UniformInt32->GetAsUnboundVariabilityType();
|
||||
}
|
||||
|
||||
if (vectorSize > 0) {
|
||||
const AtomicType *atomicType = dynamic_cast<const AtomicType *>(bt);
|
||||
const AtomicType *atomicType = CastType<AtomicType>(retType);
|
||||
if (atomicType == NULL) {
|
||||
Error(pos, "Only atomic types (int, float, ...) are legal for vector "
|
||||
"types.");
|
||||
return NULL;
|
||||
}
|
||||
bt = new VectorType(atomicType, vectorSize);
|
||||
retType = new VectorType(atomicType, vectorSize);
|
||||
}
|
||||
|
||||
return lApplyTypeQualifiers(typeQualifiers, bt, pos);
|
||||
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
|
||||
|
||||
if (soaWidth > 0) {
|
||||
const StructType *st = CastType<StructType>(retType);
|
||||
|
||||
if (st == NULL) {
|
||||
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
|
||||
"type \"%s\".", soaWidth, retType->GetString().c_str());
|
||||
return NULL;
|
||||
}
|
||||
else if (soaWidth <= 0 || (soaWidth & (soaWidth - 1)) != 0) {
|
||||
Error(pos, "soa<%d> width illegal. Value must be positive power "
|
||||
"of two.", soaWidth);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (st->IsUniformType()) {
|
||||
Error(pos, "\"uniform\" qualifier and \"soa<%d>\" qualifier can't "
|
||||
"both be used in a type declaration.", soaWidth);
|
||||
return NULL;
|
||||
}
|
||||
else if (st->IsVaryingType()) {
|
||||
Error(pos, "\"varying\" qualifier and \"soa<%d>\" qualifier can't "
|
||||
"both be used in a type declaration.", soaWidth);
|
||||
return NULL;
|
||||
}
|
||||
else
|
||||
retType = st->GetAsSOAType(soaWidth);
|
||||
|
||||
if (soaWidth < g->target->getVectorWidth())
|
||||
PerformanceWarning(pos, "soa<%d> width smaller than gang size %d "
|
||||
"currently leads to inefficient code to access "
|
||||
"soa types.", soaWidth, g->target->getVectorWidth());
|
||||
}
|
||||
|
||||
return retType;
|
||||
}
|
||||
|
||||
|
||||
@@ -127,7 +191,6 @@ lGetStorageClassName(StorageClass storageClass) {
|
||||
case SC_NONE: return "";
|
||||
case SC_EXTERN: return "extern";
|
||||
case SC_EXTERN_C: return "extern \"C\"";
|
||||
case SC_EXPORT: return "export";
|
||||
case SC_STATIC: return "static";
|
||||
case SC_TYPEDEF: return "typedef";
|
||||
default: FATAL("Unhandled storage class in lGetStorageClassName");
|
||||
@@ -138,177 +201,201 @@ lGetStorageClassName(StorageClass storageClass) {
|
||||
|
||||
void
|
||||
DeclSpecs::Print() const {
|
||||
printf("%s ", lGetStorageClassName(storageClass));
|
||||
printf("Declspecs: [%s ", lGetStorageClassName(storageClass));
|
||||
|
||||
if (soaWidth > 0) printf("soa<%d> ", soaWidth);
|
||||
|
||||
if (typeQualifiers & TYPEQUAL_INLINE) printf("inline ");
|
||||
if (typeQualifiers & TYPEQUAL_CONST) printf("const ");
|
||||
if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
|
||||
if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
|
||||
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
|
||||
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
|
||||
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
|
||||
|
||||
printf("%s", baseType->GetString().c_str());
|
||||
lPrintTypeQualifiers(typeQualifiers);
|
||||
printf("base type: %s", baseType->GetString().c_str());
|
||||
|
||||
if (vectorSize > 0) printf("<%d>", vectorSize);
|
||||
printf("]");
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Declarator
|
||||
|
||||
Declarator::Declarator(DeclaratorKind dk, SourcePos p)
|
||||
: pos(p), kind(dk) {
|
||||
Declarator::Declarator(DeclaratorKind dk, SourcePos p)
|
||||
: pos(p), kind(dk) {
|
||||
child = NULL;
|
||||
typeQualifiers = 0;
|
||||
storageClass = SC_NONE;
|
||||
arraySize = -1;
|
||||
sym = NULL;
|
||||
type = NULL;
|
||||
initExpr = NULL;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
|
||||
const Type *t = GetType(ds);
|
||||
Symbol *sym = GetSymbol();
|
||||
if (sym != NULL) {
|
||||
sym->type = t;
|
||||
sym->storageClass = ds->storageClass;
|
||||
const Type *baseType = ds->GetBaseType(pos);
|
||||
InitFromType(baseType, ds);
|
||||
|
||||
if (type == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
return;
|
||||
}
|
||||
|
||||
storageClass = ds->storageClass;
|
||||
|
||||
if (ds->declSpecList.size() > 0 &&
|
||||
CastType<FunctionType>(type) == NULL) {
|
||||
Error(pos, "__declspec specifiers for non-function type \"%s\" are "
|
||||
"not used.", type->GetString().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Symbol *
|
||||
Declarator::GetSymbol() const {
|
||||
// The symbol lives at the last child in the chain, so walk down there
|
||||
// and return the one there.
|
||||
const Declarator *d = this;
|
||||
while (d->child != NULL)
|
||||
d = d->child;
|
||||
return d->sym;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Declarator::Print() const {
|
||||
Symbol *sym = GetSymbol();
|
||||
if (sym != NULL)
|
||||
printf("%s", sym->name.c_str());
|
||||
Declarator::Print(int indent) const {
|
||||
printf("%*cdeclarator: [", indent, ' ');
|
||||
pos.Print();
|
||||
|
||||
lPrintTypeQualifiers(typeQualifiers);
|
||||
printf("%s ", lGetStorageClassName(storageClass));
|
||||
if (name.size() > 0)
|
||||
printf("%s", name.c_str());
|
||||
else
|
||||
printf("(null symbol)");
|
||||
printf("(unnamed)");
|
||||
|
||||
printf(", array size = %d", arraySize);
|
||||
|
||||
printf(", kind = ");
|
||||
switch (kind) {
|
||||
case DK_BASE: printf("base"); break;
|
||||
case DK_POINTER: printf("pointer"); break;
|
||||
case DK_REFERENCE: printf("reference"); break;
|
||||
case DK_ARRAY: printf("array"); break;
|
||||
case DK_FUNCTION: printf("function"); break;
|
||||
default: FATAL("Unhandled declarator kind");
|
||||
}
|
||||
|
||||
if (initExpr != NULL) {
|
||||
printf(" = (");
|
||||
initExpr->Print();
|
||||
printf(")");
|
||||
}
|
||||
pos.Print();
|
||||
}
|
||||
|
||||
|
||||
Symbol *
|
||||
Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
|
||||
const FunctionType *type =
|
||||
dynamic_cast<const FunctionType *>(GetType(ds));
|
||||
if (type == NULL)
|
||||
return NULL;
|
||||
|
||||
Symbol *declSym = GetSymbol();
|
||||
assert(declSym != NULL);
|
||||
|
||||
// Get the symbol for the function from the symbol table. (It should
|
||||
// already have been added to the symbol table by AddGlobal() by the
|
||||
// time we get here.)
|
||||
Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
|
||||
if (funSym != NULL)
|
||||
// May be NULL due to error earlier in compilation
|
||||
funSym->pos = pos;
|
||||
|
||||
// Walk down to the declarator for the function. (We have to get past
|
||||
// the stuff that specifies the function's return type before we get to
|
||||
// the function's declarator.)
|
||||
Declarator *d = this;
|
||||
while (d != NULL && d->kind != DK_FUNCTION)
|
||||
d = d->child;
|
||||
assert(d != NULL);
|
||||
|
||||
for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
|
||||
Declaration *pdecl = d->functionParams[i];
|
||||
assert(pdecl->declarators.size() == 1);
|
||||
funArgs->push_back(pdecl->declarators[0]->GetSymbol());
|
||||
if (functionParams.size() > 0) {
|
||||
for (unsigned int i = 0; i < functionParams.size(); ++i) {
|
||||
printf("\n%*cfunc param %d:\n", indent, ' ', i);
|
||||
functionParams[i]->Print(indent+4);
|
||||
}
|
||||
}
|
||||
|
||||
return funSym;
|
||||
if (child != NULL)
|
||||
child->Print(indent + 4);
|
||||
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
|
||||
const Type *
|
||||
Declarator::GetType(const Type *base, DeclSpecs *ds) const {
|
||||
void
|
||||
Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
|
||||
bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
|
||||
bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
|
||||
bool isTask = ((typeQualifiers & TYPEQUAL_TASK) != 0);
|
||||
bool isExported = ((typeQualifiers & TYPEQUAL_EXPORT) != 0);
|
||||
bool isConst = ((typeQualifiers & TYPEQUAL_CONST) != 0);
|
||||
bool isUnmasked = ((typeQualifiers & TYPEQUAL_UNMASKED) != 0);
|
||||
|
||||
if (hasUniformQual && hasVaryingQual) {
|
||||
Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
if (kind != DK_FUNCTION && isTask)
|
||||
if (kind != DK_FUNCTION && isTask) {
|
||||
Error(pos, "\"task\" qualifier illegal in variable declaration.");
|
||||
return;
|
||||
}
|
||||
if (kind != DK_FUNCTION && isUnmasked) {
|
||||
Error(pos, "\"unmasked\" qualifier illegal in variable declaration.");
|
||||
return;
|
||||
}
|
||||
if (kind != DK_FUNCTION && isExported) {
|
||||
Error(pos, "\"export\" qualifier illegal in variable declaration.");
|
||||
return;
|
||||
}
|
||||
|
||||
const Type *type = base;
|
||||
switch (kind) {
|
||||
case DK_BASE:
|
||||
Variability variability(Variability::Unbound);
|
||||
if (hasUniformQual)
|
||||
variability = Variability::Uniform;
|
||||
else if (hasVaryingQual)
|
||||
variability = Variability::Varying;
|
||||
|
||||
if (kind == DK_BASE) {
|
||||
// All of the type qualifiers should be in the DeclSpecs for the
|
||||
// base declarator
|
||||
assert(typeQualifiers == 0);
|
||||
assert(child == NULL);
|
||||
return type;
|
||||
|
||||
case DK_POINTER:
|
||||
type = new PointerType(type, hasUniformQual, isConst);
|
||||
if (child != NULL)
|
||||
return child->GetType(type, ds);
|
||||
AssertPos(pos, typeQualifiers == 0);
|
||||
AssertPos(pos, child == NULL);
|
||||
type = baseType;
|
||||
}
|
||||
else if (kind == DK_POINTER) {
|
||||
/* For now, any pointer to an SOA type gets the slice property; if
|
||||
we add the capability to declare pointers as slices or not,
|
||||
we'll want to set this based on a type qualifier here. */
|
||||
const Type *ptrType = new PointerType(baseType, variability, isConst,
|
||||
baseType->IsSOAType());
|
||||
if (child != NULL) {
|
||||
child->InitFromType(ptrType, ds);
|
||||
type = child->type;
|
||||
name = child->name;
|
||||
}
|
||||
else
|
||||
return type;
|
||||
break;
|
||||
|
||||
case DK_REFERENCE:
|
||||
if (hasUniformQual)
|
||||
type = ptrType;
|
||||
}
|
||||
else if (kind == DK_REFERENCE) {
|
||||
if (hasUniformQual) {
|
||||
Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
|
||||
if (hasVaryingQual)
|
||||
return;
|
||||
}
|
||||
if (hasVaryingQual) {
|
||||
Error(pos, "\"varying\" qualifier is illegal to apply to references.");
|
||||
if (isConst)
|
||||
return;
|
||||
}
|
||||
if (isConst) {
|
||||
Error(pos, "\"const\" qualifier is to illegal apply to references.");
|
||||
|
||||
return;
|
||||
}
|
||||
// The parser should disallow this already, but double check.
|
||||
if (dynamic_cast<const ReferenceType *>(type) != NULL) {
|
||||
if (CastType<ReferenceType>(baseType) != NULL) {
|
||||
Error(pos, "References to references are illegal.");
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
type = new ReferenceType(type);
|
||||
if (child != NULL)
|
||||
return child->GetType(type, ds);
|
||||
const Type *refType = new ReferenceType(baseType);
|
||||
if (child != NULL) {
|
||||
child->InitFromType(refType, ds);
|
||||
type = child->type;
|
||||
name = child->name;
|
||||
}
|
||||
else
|
||||
return type;
|
||||
break;
|
||||
type = refType;
|
||||
}
|
||||
else if (kind == DK_ARRAY) {
|
||||
if (Type::Equal(baseType, AtomicType::Void)) {
|
||||
Error(pos, "Arrays of \"void\" type are illegal.");
|
||||
return;
|
||||
}
|
||||
if (CastType<ReferenceType>(baseType)) {
|
||||
Error(pos, "Arrays of references (type \"%s\") are illegal.",
|
||||
baseType->GetString().c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
case DK_ARRAY:
|
||||
type = new ArrayType(type, arraySize);
|
||||
if (child)
|
||||
return child->GetType(type, ds);
|
||||
const Type *arrayType = new ArrayType(baseType, arraySize);
|
||||
if (child != NULL) {
|
||||
child->InitFromType(arrayType, ds);
|
||||
type = child->type;
|
||||
name = child->name;
|
||||
}
|
||||
else
|
||||
return type;
|
||||
break;
|
||||
|
||||
case DK_FUNCTION: {
|
||||
std::vector<const Type *> args;
|
||||
std::vector<std::string> argNames;
|
||||
std::vector<ConstExpr *> argDefaults;
|
||||
std::vector<SourcePos> argPos;
|
||||
type = arrayType;
|
||||
}
|
||||
else if (kind == DK_FUNCTION) {
|
||||
llvm::SmallVector<const Type *, 8> args;
|
||||
llvm::SmallVector<std::string, 8> argNames;
|
||||
llvm::SmallVector<Expr *, 8> argDefaults;
|
||||
llvm::SmallVector<SourcePos, 8> argPos;
|
||||
|
||||
// Loop over the function arguments and store the names, types,
|
||||
// default values (if any), and source file positions each one in
|
||||
@@ -316,33 +403,44 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
|
||||
for (unsigned int i = 0; i < functionParams.size(); ++i) {
|
||||
Declaration *d = functionParams[i];
|
||||
|
||||
char buf[32];
|
||||
Symbol *sym;
|
||||
if (d == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
continue;
|
||||
}
|
||||
if (d->declarators.size() == 0) {
|
||||
// function declaration like foo(float), w/o a name for
|
||||
// the parameter
|
||||
// function declaration like foo(float), w/o a name for the
|
||||
// parameter; wire up a placeholder Declarator for it
|
||||
d->declarators.push_back(new Declarator(DK_BASE, pos));
|
||||
d->declarators[0]->InitFromDeclSpecs(d->declSpecs);
|
||||
}
|
||||
|
||||
AssertPos(pos, d->declarators.size() == 1);
|
||||
Declarator *decl = d->declarators[0];
|
||||
if (decl == NULL || decl->type == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (decl->name == "") {
|
||||
// Give a name to any anonymous parameter declarations
|
||||
char buf[32];
|
||||
sprintf(buf, "__anon_parameter_%d", i);
|
||||
sym = new Symbol(buf, pos);
|
||||
sym->type = d->declSpecs->GetBaseType(pos);
|
||||
}
|
||||
else {
|
||||
sym = d->declarators[0]->GetSymbol();
|
||||
if (sym == NULL) {
|
||||
// Handle more complex anonymous declarations like
|
||||
// float (float **).
|
||||
sprintf(buf, "__anon_parameter_%d", i);
|
||||
sym = new Symbol(buf, d->declarators[0]->pos);
|
||||
sym->type = d->declarators[0]->GetType(d->declSpecs);
|
||||
}
|
||||
decl->name = buf;
|
||||
}
|
||||
decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
|
||||
|
||||
if (d->declSpecs->storageClass != SC_NONE)
|
||||
Error(sym->pos, "Storage class \"%s\" is illegal in "
|
||||
"function parameter declaration for parameter \"%s\".",
|
||||
Error(decl->pos, "Storage class \"%s\" is illegal in "
|
||||
"function parameter declaration for parameter \"%s\".",
|
||||
lGetStorageClassName(d->declSpecs->storageClass),
|
||||
sym->name.c_str());
|
||||
decl->name.c_str());
|
||||
if (Type::Equal(decl->type, AtomicType::Void)) {
|
||||
Error(decl->pos, "Parameter with type \"void\" illegal in function "
|
||||
"parameter list.");
|
||||
decl->type = NULL;
|
||||
}
|
||||
|
||||
const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
|
||||
const ArrayType *at = CastType<ArrayType>(decl->type);
|
||||
if (at != NULL) {
|
||||
// As in C, arrays are passed to functions as pointers to
|
||||
// their element type. We'll just immediately make this
|
||||
@@ -352,115 +450,125 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
|
||||
// report this differently than it was originally declared
|
||||
// in the function, but it's not clear that this is a
|
||||
// significant problem.)
|
||||
sym->type = PointerType::GetUniform(at->GetElementType());
|
||||
const Type *targetType = at->GetElementType();
|
||||
if (targetType == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
return;
|
||||
}
|
||||
|
||||
decl->type = PointerType::GetUniform(targetType, at->IsSOAType());
|
||||
|
||||
// Make sure there are no unsized arrays (other than the
|
||||
// first dimension) in function parameter lists.
|
||||
at = dynamic_cast<const ArrayType *>(at->GetElementType());
|
||||
at = CastType<ArrayType>(targetType);
|
||||
while (at != NULL) {
|
||||
if (at->GetElementCount() == 0)
|
||||
Error(sym->pos, "Arrays with unsized dimensions in "
|
||||
Error(decl->pos, "Arrays with unsized dimensions in "
|
||||
"dimensions after the first one are illegal in "
|
||||
"function parameter lists.");
|
||||
at = dynamic_cast<const ArrayType *>(at->GetElementType());
|
||||
at = CastType<ArrayType>(at->GetElementType());
|
||||
}
|
||||
}
|
||||
|
||||
args.push_back(sym->type);
|
||||
argNames.push_back(sym->name);
|
||||
argPos.push_back(sym->pos);
|
||||
args.push_back(decl->type);
|
||||
argNames.push_back(decl->name);
|
||||
argPos.push_back(decl->pos);
|
||||
|
||||
ConstExpr *init = NULL;
|
||||
if (d->declarators.size()) {
|
||||
// Try to find an initializer expression; if there is one,
|
||||
// it lives down to the base declarator.
|
||||
Declarator *decl = d->declarators[0];
|
||||
while (decl->child != NULL) {
|
||||
assert(decl->initExpr == NULL);
|
||||
Expr *init = NULL;
|
||||
// Try to find an initializer expression.
|
||||
while (decl != NULL) {
|
||||
if (decl->initExpr != NULL) {
|
||||
decl->initExpr = TypeCheck(decl->initExpr);
|
||||
decl->initExpr = Optimize(decl->initExpr);
|
||||
if (decl->initExpr != NULL) {
|
||||
init = dynamic_cast<ConstExpr *>(decl->initExpr);
|
||||
if (init == NULL)
|
||||
init = dynamic_cast<NullPointerExpr *>(decl->initExpr);
|
||||
if (init == NULL)
|
||||
Error(decl->initExpr->pos, "Default value for parameter "
|
||||
"\"%s\" must be a compile-time constant.",
|
||||
decl->name.c_str());
|
||||
}
|
||||
break;
|
||||
}
|
||||
else
|
||||
decl = decl->child;
|
||||
}
|
||||
|
||||
if (decl->initExpr != NULL &&
|
||||
(decl->initExpr = decl->initExpr->TypeCheck()) != NULL &&
|
||||
(decl->initExpr = decl->initExpr->Optimize()) != NULL &&
|
||||
(init = dynamic_cast<ConstExpr *>(decl->initExpr)) == NULL) {
|
||||
Error(decl->initExpr->pos, "Default value for parameter "
|
||||
"\"%s\" must be a compile-time constant.",
|
||||
sym->name.c_str());
|
||||
}
|
||||
}
|
||||
argDefaults.push_back(init);
|
||||
}
|
||||
|
||||
const Type *returnType = type;
|
||||
const Type *returnType = baseType;
|
||||
if (returnType == NULL) {
|
||||
Error(pos, "No return type provided in function declaration.");
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
bool isExported = ds && (ds->storageClass == SC_EXPORT);
|
||||
if (CastType<FunctionType>(returnType) != NULL) {
|
||||
Error(pos, "Illegal to return function type from function.");
|
||||
return;
|
||||
}
|
||||
|
||||
returnType = returnType->ResolveUnboundVariability(Variability::Varying);
|
||||
|
||||
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
|
||||
bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
|
||||
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
|
||||
bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);
|
||||
|
||||
if (isExported && isTask) {
|
||||
Error(pos, "Function can't have both \"task\" and \"export\" "
|
||||
"qualifiers");
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
if (isExternC && isTask) {
|
||||
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
|
||||
"qualifiers");
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
if (isExternC && isExported) {
|
||||
Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
|
||||
"qualifiers");
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
if (isUnmasked && isExported)
|
||||
Warning(pos, "\"unmasked\" qualifier is redundant for exported "
|
||||
"functions.");
|
||||
|
||||
if (child == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
return;
|
||||
}
|
||||
|
||||
Type *functionType =
|
||||
new FunctionType(returnType, args, pos, argNames, argDefaults,
|
||||
argPos, isTask, isExported, isExternC);
|
||||
return child->GetType(functionType, ds);
|
||||
const FunctionType *functionType =
|
||||
new FunctionType(returnType, args, argNames, argDefaults,
|
||||
argPos, isTask, isExported, isExternC, isUnmasked);
|
||||
|
||||
// handle any explicit __declspecs on the function
|
||||
if (ds != NULL) {
|
||||
for (int i = 0; i < (int)ds->declSpecList.size(); ++i) {
|
||||
std::string str = ds->declSpecList[i].first;
|
||||
SourcePos pos = ds->declSpecList[i].second;
|
||||
|
||||
if (str == "safe")
|
||||
(const_cast<FunctionType *>(functionType))->isSafe = true;
|
||||
else if (!strncmp(str.c_str(), "cost", 4)) {
|
||||
int cost = atoi(str.c_str() + 4);
|
||||
if (cost < 0)
|
||||
Error(pos, "Negative function cost %d is illegal.",
|
||||
cost);
|
||||
(const_cast<FunctionType *>(functionType))->costOverride = cost;
|
||||
}
|
||||
else
|
||||
Error(pos, "__declspec parameter \"%s\" unknown.", str.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
child->InitFromType(functionType, ds);
|
||||
type = child->type;
|
||||
name = child->name;
|
||||
}
|
||||
default:
|
||||
FATAL("Unexpected decl kind");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if 0
|
||||
// Make sure we actually have an array of structs ..
|
||||
const StructType *childStructType =
|
||||
dynamic_cast<const StructType *>(childType);
|
||||
if (childStructType == NULL) {
|
||||
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
|
||||
"type \"%s\".", soaWidth, childType->GetString().c_str());
|
||||
return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
|
||||
}
|
||||
else if ((soaWidth & (soaWidth - 1)) != 0) {
|
||||
Error(pos, "soa<%d> width illegal. Value must be power of two.",
|
||||
soaWidth);
|
||||
return NULL;
|
||||
}
|
||||
else if (arraySize != -1 && (arraySize % soaWidth) != 0) {
|
||||
Error(pos, "soa<%d> width must evenly divide array size %d.",
|
||||
soaWidth, arraySize);
|
||||
return NULL;
|
||||
}
|
||||
return new SOAArrayType(childStructType, arraySize == -1 ? 0 : arraySize,
|
||||
soaWidth);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
const Type *
|
||||
Declarator::GetType(DeclSpecs *ds) const {
|
||||
const Type *baseType = ds->GetBaseType(pos);
|
||||
const Type *type = GetType(baseType, ds);
|
||||
return type;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Declaration
|
||||
|
||||
@@ -485,49 +593,72 @@ Declaration::Declaration(DeclSpecs *ds, Declarator *d) {
|
||||
|
||||
std::vector<VariableDeclaration>
|
||||
Declaration::GetVariableDeclarations() const {
|
||||
assert(declSpecs->storageClass != SC_TYPEDEF);
|
||||
Assert(declSpecs->storageClass != SC_TYPEDEF);
|
||||
std::vector<VariableDeclaration> vars;
|
||||
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i) {
|
||||
if (declarators[i] == NULL)
|
||||
continue;
|
||||
Declarator *decl = declarators[i];
|
||||
if (decl == NULL)
|
||||
if (decl == NULL || decl->type == NULL) {
|
||||
// Ignore earlier errors
|
||||
Assert(m->errorCount > 0);
|
||||
continue;
|
||||
|
||||
Symbol *sym = decl->GetSymbol();
|
||||
if (dynamic_cast<const FunctionType *>(sym->type) != NULL) {
|
||||
// function declaration
|
||||
m->symbolTable->AddFunction(sym);
|
||||
}
|
||||
else {
|
||||
|
||||
if (Type::Equal(decl->type, AtomicType::Void))
|
||||
Error(decl->pos, "\"void\" type variable illegal in declaration.");
|
||||
else if (CastType<FunctionType>(decl->type) == NULL) {
|
||||
decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
|
||||
Symbol *sym = new Symbol(decl->name, decl->pos, decl->type,
|
||||
decl->storageClass);
|
||||
m->symbolTable->AddVariable(sym);
|
||||
vars.push_back(VariableDeclaration(sym, decl->initExpr));
|
||||
}
|
||||
}
|
||||
|
||||
return vars;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Declaration::Print() const {
|
||||
printf("Declaration: specs [");
|
||||
declSpecs->Print();
|
||||
printf("], declarators [");
|
||||
for (unsigned int i = 0 ; i < declarators.size(); ++i) {
|
||||
declarators[i]->Print();
|
||||
printf("%s", (i == declarators.size() - 1) ? "]" : ", ");
|
||||
Declaration::DeclareFunctions() {
|
||||
Assert(declSpecs->storageClass != SC_TYPEDEF);
|
||||
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i) {
|
||||
Declarator *decl = declarators[i];
|
||||
if (decl == NULL || decl->type == NULL) {
|
||||
// Ignore earlier errors
|
||||
Assert(m->errorCount > 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
const FunctionType *ftype = CastType<FunctionType>(decl->type);
|
||||
if (ftype == NULL)
|
||||
continue;
|
||||
|
||||
bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
|
||||
m->AddFunctionDeclaration(decl->name, ftype, decl->storageClass,
|
||||
isInline, decl->pos);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Declaration::Print(int indent) const {
|
||||
printf("%*cDeclaration: specs [", indent, ' ');
|
||||
declSpecs->Print();
|
||||
printf("], declarators:\n");
|
||||
for (unsigned int i = 0 ; i < declarators.size(); ++i)
|
||||
declarators[i]->Print(indent+4);
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void
|
||||
GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
|
||||
std::vector<const Type *> *elementTypes,
|
||||
std::vector<std::string> *elementNames,
|
||||
std::vector<SourcePos> *elementPositions) {
|
||||
llvm::SmallVector<const Type *, 8> *elementTypes,
|
||||
llvm::SmallVector<std::string, 8> *elementNames,
|
||||
llvm::SmallVector<SourcePos, 8> *elementPositions) {
|
||||
std::set<std::string> seenNames;
|
||||
for (unsigned int i = 0; i < sd.size(); ++i) {
|
||||
const Type *type = sd[i]->type;
|
||||
@@ -537,35 +668,41 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
|
||||
// FIXME: making this fake little DeclSpecs here is really
|
||||
// disgusting
|
||||
DeclSpecs ds(type);
|
||||
if (type->IsUniformType())
|
||||
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
|
||||
else
|
||||
ds.typeQualifiers |= TYPEQUAL_VARYING;
|
||||
if (Type::Equal(type, AtomicType::Void) == false) {
|
||||
if (type->IsUniformType())
|
||||
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
|
||||
else if (type->IsVaryingType())
|
||||
ds.typeQualifiers |= TYPEQUAL_VARYING;
|
||||
else if (type->GetSOAWidth() != 0)
|
||||
ds.soaWidth = type->GetSOAWidth();
|
||||
// FIXME: ds.vectorSize?
|
||||
}
|
||||
|
||||
for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
|
||||
Declarator *d = (*sd[i]->declarators)[j];
|
||||
d->InitFromDeclSpecs(&ds);
|
||||
|
||||
Symbol *sym = d->GetSymbol();
|
||||
if (Type::Equal(d->type, AtomicType::Void))
|
||||
Error(d->pos, "\"void\" type illegal for struct member.");
|
||||
|
||||
const ArrayType *arrayType =
|
||||
dynamic_cast<const ArrayType *>(sym->type);
|
||||
if (arrayType != NULL && arrayType->GetElementCount() == 0) {
|
||||
Error(d->pos, "Unsized arrays aren't allowed in struct "
|
||||
"definitions.");
|
||||
elementTypes->push_back(NULL);
|
||||
}
|
||||
else
|
||||
elementTypes->push_back(sym->type);
|
||||
elementTypes->push_back(d->type);
|
||||
|
||||
if (seenNames.find(sym->name) != seenNames.end())
|
||||
if (seenNames.find(d->name) != seenNames.end())
|
||||
Error(d->pos, "Struct member \"%s\" has same name as a "
|
||||
"previously-declared member.", sym->name.c_str());
|
||||
"previously-declared member.", d->name.c_str());
|
||||
else
|
||||
seenNames.insert(sym->name);
|
||||
seenNames.insert(d->name);
|
||||
|
||||
elementNames->push_back(sym->name);
|
||||
elementPositions->push_back(sym->pos);
|
||||
elementNames->push_back(d->name);
|
||||
elementPositions->push_back(d->pos);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int)elementTypes->size() - 1; ++i) {
|
||||
const ArrayType *arrayType = CastType<ArrayType>((*elementTypes)[i]);
|
||||
|
||||
if (arrayType != NULL && arrayType->GetElementCount() == 0)
|
||||
Error((*elementPositions)[i], "Unsized arrays aren't allowed except "
|
||||
"for the last member in a struct definition.");
|
||||
}
|
||||
}
|
||||
|
||||
72
decl.h
72
decl.h
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file decl.h
|
||||
@@ -47,30 +47,21 @@
|
||||
variables--here, that the declaration has the 'static' and 'uniform'
|
||||
qualifiers, and that it's basic type is 'int'. Then for each variable
|
||||
declaration, the Declaraiton class holds an instance of a Declarator,
|
||||
which in turn records the per-variable information like the symbol
|
||||
name, array size (if any), initializer expression, etc.
|
||||
which in turn records the per-variable information like the name, array
|
||||
size (if any), initializer expression, etc.
|
||||
*/
|
||||
|
||||
#ifndef ISPC_DECL_H
|
||||
#define ISPC_DECL_H
|
||||
|
||||
#include "ispc.h"
|
||||
#include <llvm/ADT/SmallVector.h>
|
||||
|
||||
struct VariableDeclaration;
|
||||
|
||||
class Declaration;
|
||||
class Declarator;
|
||||
|
||||
enum StorageClass {
|
||||
SC_NONE,
|
||||
SC_EXTERN,
|
||||
SC_EXPORT,
|
||||
SC_STATIC,
|
||||
SC_TYPEDEF,
|
||||
SC_EXTERN_C
|
||||
};
|
||||
|
||||
|
||||
/* Multiple qualifiers can be provided with types in declarations;
|
||||
therefore, they are set up so that they can be ANDed together into an
|
||||
int. */
|
||||
@@ -82,6 +73,8 @@ enum StorageClass {
|
||||
#define TYPEQUAL_SIGNED (1<<4)
|
||||
#define TYPEQUAL_UNSIGNED (1<<5)
|
||||
#define TYPEQUAL_INLINE (1<<6)
|
||||
#define TYPEQUAL_EXPORT (1<<7)
|
||||
#define TYPEQUAL_UNMASKED (1<<8)
|
||||
|
||||
/** @brief Representation of the declaration specifiers in a declaration.
|
||||
|
||||
@@ -90,7 +83,8 @@ enum StorageClass {
|
||||
*/
|
||||
class DeclSpecs {
|
||||
public:
|
||||
DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE, int tq = TYPEQUAL_NONE);
|
||||
DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE,
|
||||
int tq = TYPEQUAL_NONE);
|
||||
|
||||
void Print() const;
|
||||
|
||||
@@ -117,6 +111,8 @@ public:
|
||||
SOA width specified. Otherwise this is zero.
|
||||
*/
|
||||
int soaWidth;
|
||||
|
||||
std::vector<std::pair<std::string, SourcePos> > declSpecList;
|
||||
};
|
||||
|
||||
|
||||
@@ -128,7 +124,7 @@ enum DeclaratorKind {
|
||||
DK_FUNCTION
|
||||
};
|
||||
|
||||
/** @brief Representation of the declaration of a single variable.
|
||||
/** @brief Representation of the declaration of a single variable.
|
||||
|
||||
In conjunction with an instance of the DeclSpecs, this gives us
|
||||
everything we need for a full variable declaration.
|
||||
@@ -138,25 +134,13 @@ public:
|
||||
Declarator(DeclaratorKind dk, SourcePos p);
|
||||
|
||||
/** Once a DeclSpecs instance is available, this method completes the
|
||||
initialization of the Symbol, setting its Type accordingly.
|
||||
initialization of the type member.
|
||||
*/
|
||||
void InitFromDeclSpecs(DeclSpecs *ds);
|
||||
|
||||
/** Get the actual type of the combination of Declarator and the given
|
||||
DeclSpecs. If an explicit base type is provided, the declarator is
|
||||
applied to that type; otherwise the base type from the DeclSpecs is
|
||||
used. */
|
||||
const Type *GetType(DeclSpecs *ds) const;
|
||||
const Type *GetType(const Type *base, DeclSpecs *ds) const;
|
||||
void InitFromType(const Type *base, DeclSpecs *ds);
|
||||
|
||||
/** Returns the symbol corresponding to the function declared by this
|
||||
declarator and symbols for its arguments in *args. */
|
||||
Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
|
||||
|
||||
/** Returns the symbol associated with the declarator. */
|
||||
Symbol *GetSymbol() const;
|
||||
|
||||
void Print() const;
|
||||
void Print(int indent) const;
|
||||
|
||||
/** Position of the declarator in the source program. */
|
||||
const SourcePos pos;
|
||||
@@ -175,18 +159,24 @@ public:
|
||||
/** Type qualifiers provided with the declarator. */
|
||||
int typeQualifiers;
|
||||
|
||||
StorageClass storageClass;
|
||||
|
||||
/** For array declarators, this gives the declared size of the array.
|
||||
Unsized arrays have arraySize == 0. */
|
||||
Unsized arrays have arraySize == 0. */
|
||||
int arraySize;
|
||||
|
||||
/** Symbol associated with the declarator. */
|
||||
Symbol *sym;
|
||||
/** Name associated with the declarator. */
|
||||
std::string name;
|
||||
|
||||
/** Initialization expression for the variable. May be NULL. */
|
||||
Expr *initExpr;
|
||||
|
||||
/** Type of the declarator. This is NULL until InitFromDeclSpecs() or
|
||||
InitFromType() is called. */
|
||||
const Type *type;
|
||||
|
||||
/** For function declarations, this holds the Declaration *s for the
|
||||
funciton's parameters. */
|
||||
function's parameters. */
|
||||
std::vector<Declaration *> functionParams;
|
||||
};
|
||||
|
||||
@@ -199,7 +189,7 @@ public:
|
||||
Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL);
|
||||
Declaration(DeclSpecs *ds, Declarator *d);
|
||||
|
||||
void Print() const;
|
||||
void Print(int indent) const;
|
||||
|
||||
/** This method walks through all of the Declarators in a declaration
|
||||
and returns a fully-initialized Symbol and (possibly) and
|
||||
@@ -208,6 +198,10 @@ public:
|
||||
Declarator representation.) */
|
||||
std::vector<VariableDeclaration> GetVariableDeclarations() const;
|
||||
|
||||
/** For any function declarations in the Declaration, add the
|
||||
declaration to the module. */
|
||||
void DeclareFunctions();
|
||||
|
||||
DeclSpecs *declSpecs;
|
||||
std::vector<Declarator *> declarators;
|
||||
};
|
||||
@@ -227,8 +221,8 @@ struct StructDeclaration {
|
||||
/** Given a set of StructDeclaration instances, this returns the types of
|
||||
the elements of the corresponding struct and their names. */
|
||||
extern void GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
|
||||
std::vector<const Type *> *elementTypes,
|
||||
std::vector<std::string> *elementNames,
|
||||
std::vector<SourcePos> *elementPositions);
|
||||
llvm::SmallVector<const Type *, 8> *elementTypes,
|
||||
llvm::SmallVector<std::string, 8> *elementNames,
|
||||
llvm::SmallVector<SourcePos, 8> *elementPositions);
|
||||
|
||||
#endif // ISPC_DECL_H
|
||||
|
||||
@@ -1,3 +1,334 @@
|
||||
=== v1.4.1 === (28 May 2013)
|
||||
|
||||
A major new version of ispc has been released with stability and performance
|
||||
improvements on all supported platforms (Windows, Linux and MacOS).
|
||||
This version supports LLVM 3.1, 3.2, 3.3 and 3.4. The released binaries are built with 3.2.
|
||||
|
||||
New compiler features:
|
||||
|
||||
* ISPC memory allocation returns aligned memory with platform natural alignment
|
||||
of vector registers by default. Alignment can also be managed via
|
||||
--force-alignment=<value>.
|
||||
|
||||
Important bug fixes/changes:
|
||||
|
||||
* ISPC was fixed to be fully functional when built by GCC 4.7.
|
||||
|
||||
* Major cleanup of build and test scripts on Windows.
|
||||
|
||||
* Gather/scatter performance improvements on Xeon Phi.
|
||||
|
||||
* FMA instructions are enabled for AVX2 instruction set.
|
||||
|
||||
* Support of RDRAND instruction when available via library function rdrand (Ivy Bridge).
|
||||
|
||||
Release also contains numerous bug fixes and minor improvements.
|
||||
|
||||
=== v1.3.0 === (29 June 2012)
|
||||
|
||||
This is a major new release of ispc, with support for more compilation
|
||||
targets and a number of additions to the language. As usual, the quality
|
||||
of generated code has also been improved in a number of cases and a number
|
||||
of small bugs have been fixed.
|
||||
|
||||
New targets:
|
||||
|
||||
* This release provides "beta" support for compiling to Intel® Xeon
|
||||
Phi™ processor, code named Knights Corner, the first processor in
|
||||
the Intel® Many Integrated Core Architecture. See
|
||||
http://ispc.github.com/ispc.html#compiling-for-the-intel-xeon-phi-architecture
|
||||
for more details on this support.
|
||||
|
||||
* This release also has an "avx1.1" target, which provides support for the
|
||||
new instructions in the Intel Ivy Bridge microarchitecture.
|
||||
|
||||
New language features:
|
||||
|
||||
* The foreach_active statement allows iteration over the active program
|
||||
instances in a gang. (See
|
||||
http://ispc.github.com/ispc.html#iteration-over-active-program-instances-foreach-active)
|
||||
|
||||
* foreach_unique allows iterating over subsets of program instances in a
|
||||
gang that share the same value of a variable. (See
|
||||
http://ispc.github.com/ispc.html#iteration-over-unique-elements-foreach-unique)
|
||||
|
||||
* An "unmasked" function qualifier and statement in the language allow
|
||||
re-activating execution of all program instances in a gang. (See
|
||||
http://ispc.github.com/ispc.html#re-establishing-the-execution-mask)
|
||||
|
||||
Standard library updates:
|
||||
|
||||
* The seed_rng() function has been modified to take a "varying" seed value
|
||||
when a varying RNGState is being initialized.
|
||||
|
||||
* An isnan() function has been added, to check for floating-point "not a
|
||||
number" values.
|
||||
|
||||
* The float_to_srgb8() routine does high performance conversion of
|
||||
floating-point color values to SRGB8 format.
|
||||
|
||||
Other changes:
|
||||
|
||||
* A number of bugfixes have been made for compiler crashes with malformed
|
||||
programs.
|
||||
|
||||
* Floating-point comparisons are now "unordered", so that any comparison
|
||||
where one of the operands is a "not a number" value returns false. (This
|
||||
matches standard IEEE floating-point behavior.)
|
||||
|
||||
* The code generated for 'break' statements in "varying" loops has been
|
||||
improved for some common cases.
|
||||
|
||||
* Compile time and compiler memory use have both been improved,
|
||||
particularly for large input programs.
|
||||
|
||||
* A number of bugs have been fixed in the debugging information generated
|
||||
by the compiler when the "-g" command-line flag is used.
|
||||
|
||||
=== v1.2.2 === (20 April 2012)
|
||||
|
||||
This release includes a number of small additions to functionality and a
|
||||
number of bugfixes. New functionality includes:
|
||||
|
||||
* It's now possible to forward declare structures as in C/C++: "struct
|
||||
Foo;". After such a declaration, structs with pointers to "Foo" and
|
||||
functions that take pointers or references to Foo structs can be declared
|
||||
without the entire definition of Foo being available.
|
||||
|
||||
* New built-in types size_t, ptrdiff_t, and [u]intptr_t are now available,
|
||||
corresponding to the equivalent types in C.
|
||||
|
||||
* The standard library now provides atomic_swap*() and
|
||||
atomic_compare_exchange*() functions for void * types.
|
||||
|
||||
* The C++ backend has seen a number of improvements to the quality and
|
||||
readability of generated code.
|
||||
|
||||
A number of bugs have been fixed in this release as well. The most
|
||||
significant are:
|
||||
|
||||
* Fixed a bug where nested loops could cause a compiler crash in some
|
||||
circumstances (issues #240, and #229)
|
||||
|
||||
* Gathers could access invalid memory (and cause the program to crash) in
|
||||
some circumstances (#235)
|
||||
|
||||
* References to temporary values are now handled properly when passed to a
|
||||
function that takes a reference typed parameter.
|
||||
|
||||
* A case where incorrect code could be generated for compile-time-constant
|
||||
initializers has been fixed (#234).
|
||||
|
||||
=== v1.2.1 === (6 April 2012)
|
||||
|
||||
This release contains only minor new functionality and is mostly for many
|
||||
small bugfixes and improvements to error handling and error reporting.
|
||||
The new functionality that is present is:
|
||||
|
||||
* Significantly more efficient versions of the float / half conversion
|
||||
routines are now available in the standard library, thanks to Fabian
|
||||
Giesen.
|
||||
|
||||
* The last member of a struct can now be a zero-length array; this allows
|
||||
the trick of dynamically allocating enough storage for the struct and
|
||||
some number of array elements at the end of it.
|
||||
|
||||
Significant bugs fixed include:
|
||||
|
||||
* Issue #205: When a target ISA isn't specified, use the host system's
|
||||
capabilities to choose a target for which it will be able to run the
|
||||
generated code.
|
||||
|
||||
* Issues #215 and #217: Don't allocate storage for global variables that
|
||||
are declared "extern".
|
||||
|
||||
* Issue #197: Allow NULL as a default argument value in a function
|
||||
declaration.
|
||||
|
||||
* Issue #223: Fix bugs where taking the address of a function wouldn't work
|
||||
as expected.
|
||||
|
||||
* Issue #224: When there are overloaded variants of a function that take
|
||||
both reference and const reference parameters, give the non-const
|
||||
reference preference when matching values of that underlying type.
|
||||
|
||||
* Issue #225: An error is issued when a varying lvalue is assigned to a
|
||||
reference type (rather than crashing).
|
||||
|
||||
* Issue #193: Permit conversions from array types to void *, not just the
|
||||
pointer type of the underlying array element.
|
||||
|
||||
* Issue #199: Still evaluate expressions that are cast to (void).
|
||||
|
||||
The documentation has also been improved, with FAQs added to clarify some
|
||||
aspects of the ispc pointer model.
|
||||
|
||||
=== v1.2.0 === (20 March 2012)
|
||||
|
||||
This is a major new release of ispc, with a number of significant
|
||||
improvements to functionality, performance, and compiler robustness. It
|
||||
does, however, include three small changes to language syntax and semantics
|
||||
that may require changes to existing programs:
|
||||
|
||||
* Syntax for the "launch" keyword has been cleaned up; it's now no longer
|
||||
necessary to bracket the launched function call with angle brackets.
|
||||
(In other words, now use "launch foo();", rather than "launch < foo() >;".)
|
||||
|
||||
* When using pointers, the pointed-to data type is now "uniform" by
|
||||
default. Use the varying keyword to specify varying pointed-to types when
|
||||
needed. (i.e. "float *ptr" is a varying pointer to uniform float data,
|
||||
whereas previously it was a varying pointer to varying float values.)
|
||||
Use "varying float *" to specify a varying pointer to varying float data,
|
||||
and so forth.
|
||||
|
||||
* The details of "uniform" and "varying" and how they interact with struct
|
||||
types have been cleaned up. Now, when a struct type is declared, if the
|
||||
struct elements don't have explicit "uniform" or "varying" qualifiers,
|
||||
they are said to have "unbound" variability. When a struct type is
|
||||
instantiated, any unbound variability elements inherit the variability of
|
||||
the parent struct type. See http://ispc.github.com/ispc.html#struct-types
|
||||
for more details.
|
||||
|
||||
ispc has a new language feature that makes it much easier to use the
|
||||
efficient "(array of) structure of arrays" (AoSoA, or SoA) memory layout of
|
||||
data. A new "soa<n>" qualifier can be applied to structure types to
|
||||
specify an n-wide SoA version of the corresponding type. Array indexing
|
||||
and pointer operations with arrays of SoA types automatically handle the
|
||||
two-stage indexing calculation to access the data. See
|
||||
http://ispc.github.com/ispc.html#structure-of-array-types for more details.
|
||||
|
||||
For more efficient access of data that is still in "array of structures"
|
||||
(AoS) format, ispc has a new "memory coalescing" optimization that
|
||||
automatically detects series of strided loads and/or gathers that can be
|
||||
transformed into a more efficient set of vector loads and shuffles. A
|
||||
diagnostic is emitted when this optimization is successfully applied.
|
||||
|
||||
Smaller changes in this release:
|
||||
|
||||
* The standard library now provides memcpy(), memmove() and memset()
|
||||
functions, as well as single-precision asin() and acos() functions.
|
||||
|
||||
* -I can now be specified on the command-line to specify a search path for
|
||||
#include files.
|
||||
|
||||
* A number of improvements have been made to error reporting from the
|
||||
parser, and a number of cases where malformed programs could cause the
|
||||
compiler to crash have been fixed.
|
||||
|
||||
* A number of small improvements to the quality and performance of generated
|
||||
code have been made, including finding more cases where 32-bit addressing
|
||||
calculations can be safely done on 64-bit systems and generating better
|
||||
code for initializer expressions.
|
||||
|
||||
=== v1.1.4 === (4 February 2012)
|
||||
|
||||
There are two major bugfixes for Windows in this release. First, a number
|
||||
of failures in AVX code generation on Windows have been fixed; AVX on
|
||||
Windows now has no known issues. Second, a longstanding bug in parsing 64-bit
|
||||
integer constants on Windows has been fixed.
|
||||
|
||||
This release features a new experimental scalar target, contributed by Gabe
|
||||
Weisz <gweisz@cs.cmu.edu>. This target ("--target=generic-1") compiles
|
||||
gangs of single program instances (i.e. programCount == 1); it can be
|
||||
useful for debugging ispc programs.
|
||||
|
||||
The compiler now supports dynamic memory allocation in ispc programs (with
|
||||
"new" and "delete" operators based on C++). See
|
||||
http://ispc.github.com/ispc.html#dynamic-memory-allocation in the
|
||||
documentation for more information.
|
||||
|
||||
ispc now performs "short circuit" evaluation of the || and && logical
|
||||
operators and the ? : selection operator. (This represents the correction
|
||||
of a major incompatibility with C.) Code like "(index < arraySize &&
|
||||
array[index] == 1)" thus now executes as in C, where "array[index]" won't
|
||||
be evaluated unless "index" is less than "arraySize".
|
||||
|
||||
The standard library now provides "local" atomic operations, which are
|
||||
atomic across the gang of program instances (but not across other gangs or
|
||||
other hardware threads). See the updated documentation on atomics for more
|
||||
information:
|
||||
http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences.
|
||||
|
||||
The standard library now offers a clock() function, which returns a uniform
|
||||
int64 value that counts processor cycles; it can be used for
|
||||
fine-resolution timing measurements.
|
||||
|
||||
Finally (of limited interest now): ispc now supports the forthcoming AVX2
|
||||
instruction set, due with Haswell-generation CPUs. All tests and examples
|
||||
compile and execute correctly with AVX2. (Thanks specifically to Craig
|
||||
Topper and Nadav Rotem for work on AVX2 support in LLVM, which made this
|
||||
possible.)
|
||||
|
||||
=== v1.1.3 === (20 January 2012)
|
||||
|
||||
With this release, the language now supports "switch" statements, with the
|
||||
same semantics and syntax as in C.
|
||||
|
||||
This release includes fixes for two important performance related issues:
|
||||
the quality of code generated for "foreach" statements has been
|
||||
substantially improved (https://github.com/ispc/ispc/issues/151), and a
|
||||
performance regression with code for "gathers" that was introduced in
|
||||
v1.1.2 has been fixed in this release.
|
||||
|
||||
A number of other small bugs were fixed in this release as well, including
|
||||
one where invalid memory would sometimes be incorrectly accessed
|
||||
(https://github.com/ispc/ispc/issues/160).
|
||||
|
||||
Thanks to Jean-Luc Duprat for a number of patches that improve support for
|
||||
building on various platforms, and to Pierre-Antoine Lacaze for patches so
|
||||
that ispc builds under MinGW.
|
||||
|
||||
=== v1.1.2 === (9 January 2012)
|
||||
|
||||
The major new feature in this release is support for "generic" C++
|
||||
vectorized output; in other words, ispc can emit C++ code that corresponds
|
||||
to the vectorized computation that the ispc program represents. See the
|
||||
examples/intrinsics directory in the ispc distribution for two example
|
||||
implementations of the set of functions that must be provided to map the
|
||||
vector calls generated by ispc to target specific functions.
|
||||
|
||||
ispc now has partial support for 'goto' statements; specifically, goto is
|
||||
allowed if any enclosing control flow statements (if/for/while/do) have
|
||||
'uniform' test expressions, but not if they have 'varying' tests.
|
||||
|
||||
A number of improvements have been made to the code generated for gathers
|
||||
and scatters--one of them (better matching x86's "free" scale by 2/4/8 for
|
||||
addressing calculations) improved the performance of the noise example by
|
||||
14%.
|
||||
|
||||
Many small bugs have been fixed in this release as well, including issue
|
||||
numbers 138, 129, 135, 127, 149, and 142.
|
||||
|
||||
=== v1.1.1 === (15 December 2011)
|
||||
|
||||
This release doesn't include any significant new functionality, but does
|
||||
include small improvements in generated code and a number of bug fixes.
|
||||
|
||||
The one user-visible language change is that integer constants may be
|
||||
specified with 'u' and 'l' suffixes, like in C. For example, "1024llu"
|
||||
defines the constant with unsigned 64-bit type.
|
||||
|
||||
More informative and useful error messages are printed when function
|
||||
overload resolution fails.
|
||||
|
||||
Masking is avoided in additional cases when the mask can be
|
||||
statically-determined to be all on.
|
||||
|
||||
A number of small bugs have been fixed:
|
||||
- Under some circumstances, incorrect masks were used when assigning a
|
||||
value to a reference and when doing gathers/scatters.
|
||||
- Incorrect code could be generated in some cases when some instances
|
||||
returned part way through a function but others continued executing.
|
||||
- Type checking wasn't being performed for calls through function pointers;
|
||||
now an error is issued if the arguments don't match up, etc.
|
||||
- Incorrect code was being generated for gather/scatter to structs that had
|
||||
elements with varying short-vector types.
|
||||
- Typechecking wasn't being performed for "foreach" statements; this led to
|
||||
problems like function overload resolution not being performed if an
|
||||
overloaded function call was used to determine the iteration range.
|
||||
- A number of symbols would be multiply-defined when compiling to multiple
|
||||
targets and using the sse2-x2 target as one of them (issue #131).
|
||||
|
||||
=== v1.1.0 === (5 December 2011)
|
||||
|
||||
This is a major new release of the compiler, with significant additions to
|
||||
|
||||
@@ -1,12 +1,15 @@
|
||||
#!/bin/bash
|
||||
|
||||
for i in ispc perfguide faq; do
|
||||
rst2html.py --template=template.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css $i.txt > $i.html
|
||||
rst2html --template=template.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css $i.rst > $i.html
|
||||
done
|
||||
|
||||
rst2html.py --template=template-perf.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css perf.txt > perf.html
|
||||
rst2html --template=template-news.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css news.rst > news.html
|
||||
|
||||
rst2html --template=template-perf.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css perf.rst > perf.html
|
||||
|
||||
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
|
||||
#pdflatex ispc.tex
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
=============================================================
|
||||
Intel® SPMD Program Compiler Frequently Asked Questions (FAQ)
|
||||
=============================================================
|
||||
=====================================
|
||||
Frequently Asked Questions About ispc
|
||||
=====================================
|
||||
|
||||
This document includes a number of frequently (and not frequently) asked
|
||||
questions about ispc, the Intel® SPMD Program Compiler. The source to this
|
||||
document is in the file ``docs/faq.txt`` in the ``ispc`` source
|
||||
document is in the file ``docs/faq.rst`` in the ``ispc`` source
|
||||
distribution.
|
||||
|
||||
* Understanding ispc's Output
|
||||
@@ -14,11 +14,24 @@ distribution.
|
||||
+ `Why are there multiple versions of exported ispc functions in the assembly output?`_
|
||||
+ `How can I more easily see gathers and scatters in generated assembly?`_
|
||||
|
||||
* Running The Compiler
|
||||
|
||||
+ `Why is it required to use one of the "generic" targets with C++ output?`_
|
||||
+ `Why won't the compiler generate an object file or assembly output with the "generic" targets?`_
|
||||
|
||||
* Language Details
|
||||
|
||||
+ `What is the difference between "int *foo" and "int foo[]"?`_
|
||||
+ `Why are pointed-to types "uniform" by default?`_
|
||||
+ `What am I getting an error about assigning a varying lvalue to a reference type?`_
|
||||
|
||||
* Interoperability
|
||||
|
||||
+ `How can I supply an initial execution mask in the call from the application?`_
|
||||
+ `How can I generate a single binary executable with support for multiple instruction sets?`_
|
||||
+ `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
|
||||
+ `Is it possible to inline ispc functions in C/C++ code?`_
|
||||
+ `Why is it illegal to pass "varying" values from C/C++ to ispc functions?`_
|
||||
|
||||
* Programming Techniques
|
||||
|
||||
@@ -26,6 +39,8 @@ distribution.
|
||||
+ `How can a gang of program instances generate variable amounts of output efficiently?`_
|
||||
+ `Is it possible to use ispc for explicit vector programming?`_
|
||||
+ `How can I debug my ispc programs using Valgrind?`_
|
||||
+ `foreach statements generate more complex assembly than I'd expect; what's going on?`_
|
||||
+ `How do I launch an individual task for each active program instance?`_
|
||||
|
||||
Understanding ispc's Output
|
||||
===========================
|
||||
@@ -212,6 +227,174 @@ easier to understand:
|
||||
jmp ___pseudo_scatter_base_offsets32_32 ## TAILCALL
|
||||
|
||||
|
||||
Running The Compiler
|
||||
====================
|
||||
|
||||
Why is it required to use one of the "generic" targets with C++ output?
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
The C++ output option transforms the provided ``ispc`` program source into
|
||||
C++ code where each basic operation in the program (addition, comparison,
|
||||
etc.) is represented as a function call to an as-yet-undefined function,
|
||||
chaining the results of these calls together to perform the required
|
||||
computations. It is then expected that the user will provide the
|
||||
implementation of these functions via a header file with ``inline``
|
||||
functions defined for each of these functions and then use a C++ compiler
|
||||
to generate a final object file. (Examples of these headers include
|
||||
``examples/intrinsics/sse4.h`` and ``examples/intrinsics/knc.h`` in the
|
||||
``ispc`` distribution.)
|
||||
|
||||
If a target other than one of the "generic" ones is used with C++ output,
|
||||
then the compiler will transform certain operations into particular code
|
||||
sequences that may not be desired for the actual final target; for example,
|
||||
SSE targets that don't have hardware "gather" instructions will transform a
|
||||
gather into a sequence of scalar load instructions. When this in turn is
|
||||
transformed to C++ code, the fact that the loads were originally a gather
|
||||
is lost, and the header file of function definitions wouldn't have a chance
|
||||
to map the "gather" to a target-specific operation, as the ``knc.h`` header
|
||||
does, for example. Thus, the "generic" targets exist to provide basic
|
||||
targets of various vector widths, without imposing any limitations on the
|
||||
final target's capabilities.
|
||||
|
||||
Why won't the compiler generate an object file or assembly output with the "generic" targets?
|
||||
---------------------------------------------------------------------------------------------
|
||||
|
||||
As described in the above FAQ entry, when compiling to the "generic"
|
||||
targets, ``ispc`` generates vector code for the source program that
|
||||
transforms every basic operation in the program (addition, comparison,
|
||||
etc.) into a separate function call.
|
||||
|
||||
While there is no fundamental reason that the compiler couldn't generate
|
||||
target-specific object code with a function call to an undefined function
|
||||
for each primitive operation, doing so wouldn't actually be useful in
|
||||
practice--providing definitions of these functions in a separate object
|
||||
file and actually performing function calls for each of them (versus
|
||||
turning them into inline function calls) would be a highly inefficient way
|
||||
to run the program.
|
||||
|
||||
Therefore, in the interests of encouraging the use of the system,
|
||||
these types of output are disallowed.
|
||||
|
||||
|
||||
Language Details
|
||||
================
|
||||
|
||||
What is the difference between "int \*foo" and "int foo[]"?
|
||||
-----------------------------------------------------------
|
||||
|
||||
In C and C++, declaring a function to take a parameter ``int *foo`` and
|
||||
``int foo[]`` results in the same type for the parameter. Both are
|
||||
pointers to integers. In ``ispc``, these are different types. The first
|
||||
one is a varying pointer to a uniform integer value in memory, while the
|
||||
second results in a uniform pointer to the start of an array of varying
|
||||
integer values in memory.
|
||||
|
||||
To understand why the first is a varying pointer to a uniform integer,
|
||||
first recall that types without explicit rate qualifiers (``uniform``,
|
||||
``varying``, or ``soa<>``) are ``varying`` by default. Second, recall from
|
||||
the `discussion of pointer types in the ispc User's Guide`_ that pointed-to
|
||||
types without rate qualifiers are ``uniform`` by default. (This second
|
||||
rule is discussed further below, in `Why are pointed-to types "uniform" by
|
||||
default?`_.) The type of ``int *foo`` follows from these.
|
||||
|
||||
.. _discussion of pointer types in the ispc User's Guide: ispc.html#pointer-types
|
||||
|
||||
Conversely, in a function body, ``int foo[10]`` represents a declaration of
|
||||
a 10-element array of varying ``int`` values. In that we'd certainly like
|
||||
to be able to pass such an array to a function that takes an ``int []``
|
||||
parameter, the natural type for an ``int []`` parameter is a uniform
|
||||
pointer to varying integer values.
|
||||
|
||||
In terms of compatibility with C/C++, it's unfortunate that this
|
||||
distinction exists, though any other set of rules seems to introduce more
|
||||
awkwardness than this one. (Though we're interested to hear ideas to
|
||||
improve these rules!).
|
||||
|
||||
Why are pointed-to types "uniform" by default?
|
||||
----------------------------------------------
|
||||
|
||||
In ``ispc``, types without rate qualifiers are "varying" by default, but
|
||||
types pointed to by pointers without rate qualifiers are "uniform" by
|
||||
default. Why this difference?
|
||||
|
||||
::
|
||||
|
||||
int foo; // no rate qualifier, "varying int".
|
||||
uniform int *foo; // pointer type has no rate qualifier, pointed-to does.
|
||||
// "varying pointer to uniform int".
|
||||
int *foo; // neither pointer type nor pointed-to type ("int") have
|
||||
// rate qualifiers. Pointer type is varying by default,
|
||||
// pointed-to is uniform. "varying pointer to uniform int".
|
||||
varying int *foo; // varying pointer to varying int
|
||||
|
||||
The first rule, having types without rate qualifiers be varying by default,
|
||||
is a default that keeps the number of "uniform" or "varying" qualifiers in
|
||||
``ispc`` programs low. Most ``ispc`` programs use mostly "varying"
|
||||
variables, so this rule allows most variables to be declared without also
|
||||
requiring rate qualifiers.
|
||||
|
||||
On a related note, this rule allows many C/C++ functions to be used to
|
||||
define equivalent functions in the SPMD execution model that ``ispc``
|
||||
provides with little or no modification:
|
||||
|
||||
::
|
||||
|
||||
// scalar add in C/C++, SPMD/vector add in ispc
|
||||
int add(int a, int b) { return a + b; }
|
||||
|
||||
This motivation also explains why ``uniform int *foo`` represents a varying
|
||||
pointer; having pointers be varying by default if they don't have rate
|
||||
qualifiers similarly helps with porting code from C/C++ to ``ispc``.
|
||||
|
||||
The trickier issue is why pointed-to types are "uniform" by default. In our
|
||||
experience, data in memory that is accessed via pointers is most often
|
||||
uniform; this generally includes all data that has been allocated and
|
||||
initialized by the C/C++ application code. In practice, "varying" types are
|
||||
more generally (but not exclusively) used for local data in ``ispc``
|
||||
functions. Thus, making the pointed-to type uniform by default leads to
|
||||
more concise code for the most common cases.
|
||||
|
||||
|
||||
What am I getting an error about assigning a varying lvalue to a reference type?
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Given code like the following:
|
||||
|
||||
::
|
||||
|
||||
uniform float a[...];
|
||||
int index = ...;
|
||||
float &r = a[index];
|
||||
|
||||
``ispc`` issues the error "Initializer for reference-type variable "r" must
|
||||
have a uniform lvalue type.". The underlying issue stems from how
|
||||
references are represented in the code generated by ``ispc``. Recall that
|
||||
``ispc`` supports both uniform and varying pointer types--a uniform pointer
|
||||
points to the same location in memory for all program instances in the
|
||||
gang, while a varying pointer allows each program instance to have its own
|
||||
pointer value.
|
||||
|
||||
References are represented as a pointer in the code generated by ``ispc``,
|
||||
though this is generally opaque to the user; in ``ispc``, they are
|
||||
specifically uniform pointers. This design decision was made so that given
|
||||
code like this:
|
||||
|
||||
::
|
||||
|
||||
extern void func(float &val);
|
||||
float foo = ...;
|
||||
func(foo);
|
||||
|
||||
Then the reference would be handled efficiently as a single pointer, rather
|
||||
than unnecessarily being turned into a gang-size of pointers.
|
||||
|
||||
However, an implication of this decision is that it's not possible for
|
||||
references to refer to completely different things for each of the program
|
||||
instances. (And hence the error that is issued). In cases where a unique
|
||||
per-program-instance pointer is needed, a varying pointer should be used
|
||||
instead of a reference.
|
||||
|
||||
|
||||
Interoperability
|
||||
================
|
||||
|
||||
@@ -273,10 +456,10 @@ Then four object files will be generated: ``foo_sse2.o``, ``foo_sse4.o``,
|
||||
``foo_avx.o``, and ``foo.o``.[#]_ Link all of these into your executable, and
|
||||
when you call a function in ``foo.ispc`` from your application code,
|
||||
``ispc`` will determine which instruction sets are supported by the CPU the
|
||||
code is running on and will call the most appropraite version of the
|
||||
code is running on and will call the most appropriate version of the
|
||||
function available.
|
||||
|
||||
.. [#] Similarly, if you choose to generate assembly langauage output or
|
||||
.. [#] Similarly, if you choose to generate assembly language output or
|
||||
LLVM bitcode output, multiple versions of those files will be created.
|
||||
|
||||
In general, the version of the function that runs will be the one in the
|
||||
@@ -346,6 +529,92 @@ In a similar fashion, it's possible to find out at run-time the value of
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
Is it possible to inline ispc functions in C/C++ code?
|
||||
------------------------------------------------------
|
||||
|
||||
If you're willing to use the ``clang`` C/C++ compiler that's part of the
|
||||
LLVM tool suite, then it is possible to inline ``ispc`` code with C/C++
|
||||
(and conversely, to inline C/C++ calls in ``ispc``). Doing so can provide
|
||||
performance advantages when calling out to short functions written in the
|
||||
"other" language. Note that you don't need to use ``clang`` to compile all
|
||||
of your C/C++ code, but only for the files where you want to be able to
|
||||
inline. In order to do this, you must have a full installation of LLVM
|
||||
version 3.0 or later, including the ``clang`` compiler.
|
||||
|
||||
The basic approach is to have the various compilers emit LLVM intermediate
|
||||
representation (IR) code and to then use tools from LLVM to link together
|
||||
the IR from the compilers and then re-optimize it, which gives the LLVM
|
||||
optimizer the opportunity to do additional inlining and cross-function
|
||||
optimizations. If you have source files ``foo.ispc`` and ``foo.cpp``,
|
||||
first emit LLVM IR:
|
||||
|
||||
::
|
||||
|
||||
ispc --emit-llvm -o foo_ispc.bc foo.ispc
|
||||
clang -O2 -c -emit-llvm -o foo_cpp.bc foo.cpp
|
||||
|
||||
Next, link the two IR files into a single file and run the LLVM optimizer
|
||||
on the result:
|
||||
|
||||
::
|
||||
|
||||
llvm-link foo_ispc.bc foo_cpp.bc -o - | opt -O3 -o foo_opt.bc
|
||||
|
||||
And finally, generate a native object file:
|
||||
|
||||
::
|
||||
|
||||
llc -filetype=obj foo_opt.bc -o foo.o
|
||||
|
||||
This file can in turn be linked in with the rest of your object files when
|
||||
linking your application.
|
||||
|
||||
(Note that if you're using the AVX instruction set, you must provide the
|
||||
``-mattr=+avx`` flag to ``llc``.)
|
||||
|
||||
|
||||
Why is it illegal to pass "varying" values from C/C++ to ispc functions?
|
||||
------------------------------------------------------------------------
|
||||
|
||||
If any of the types in the parameter list to an exported function is
|
||||
"varying" (including recursively, and members of structure types, etc.),
|
||||
then ``ispc`` will issue an error and refuse to compile the function:
|
||||
|
||||
::
|
||||
|
||||
% echo "export int add(int x) { return ++x; }" | ispc
|
||||
<stdin>:1:12: Error: Illegal to return a "varying" type from exported function "add"
|
||||
<stdin>:1:20: Error: Varying parameter "x" is illegal in an exported function.
|
||||
|
||||
While there's no fundamental reason why this isn't possible, recall the
|
||||
definition of "varying" variables: they have one value for each program
|
||||
instance in the gang. As such, the number of values and amount of storage
|
||||
required to represent a varying variable depends on the gang size
|
||||
(i.e. ``programCount``), which can have different values depending on the
|
||||
compilation target.
|
||||
|
||||
``ispc`` therefore prohibits passing "varying" values between the
|
||||
application and the ``ispc`` program in order to prevent the
|
||||
application-side code from depending on a particular gang size, in order to
|
||||
encourage portability to different gang sizes. (A generally desirable
|
||||
programming practice.)
|
||||
|
||||
For cases where the size of data is actually fixed from the application
|
||||
side, the value can be passed via a pointer to a short ``uniform`` array,
|
||||
as follows:
|
||||
|
||||
::
|
||||
|
||||
export void add4(uniform int ptr[4]) {
|
||||
foreach (i = 0 ... 4)
|
||||
ptr[i]++;
|
||||
}
|
||||
|
||||
On the 4-wide SSE instruction set, this compiles to a single vector add
|
||||
instruction (and associated move instructions), while it still also
|
||||
efficiently computes the correct result on 8-wide AVX targets.
|
||||
|
||||
|
||||
Programming Techniques
|
||||
======================
|
||||
|
||||
@@ -480,3 +749,131 @@ you can use ``--target=sse4`` when compiling to run with ``valgrind``.
|
||||
Note that ``valgrind`` does not yet support programs that use the AVX
|
||||
instruction set.
|
||||
|
||||
foreach statements generate more complex assembly than I'd expect; what's going on?
|
||||
-----------------------------------------------------------------------------------
|
||||
|
||||
Given a simple ``foreach`` loop like the following:
|
||||
|
||||
::
|
||||
|
||||
void foo(uniform float a[], uniform int count) {
|
||||
foreach (i = 0 ... count)
|
||||
a[i] *= 2;
|
||||
}
|
||||
|
||||
|
||||
the ``ispc`` compiler generates approximately 40 instructions--why isn't
|
||||
the generated code simpler?
|
||||
|
||||
There are two main components to the code: one handles
|
||||
``programCount``-sized chunks of elements of the array, and the other
|
||||
handles any excess elements at the end of the array that don't completely
|
||||
fill a gang. The code for the main loop is essentially what one would
|
||||
expect: a vector of values is loaded from the array, the multiply is done,
|
||||
and the result is stored.
|
||||
|
||||
::
|
||||
|
||||
LBB0_2: ## %foreach_full_body
|
||||
movslq %edx, %rdx
|
||||
vmovups (%rdi,%rdx), %ymm1
|
||||
vmulps %ymm0, %ymm1, %ymm1
|
||||
vmovups %ymm1, (%rdi,%rdx)
|
||||
addl $32, %edx
|
||||
addl $8, %eax
|
||||
cmpl %ecx, %eax
|
||||
jl LBB0_2
|
||||
|
||||
|
||||
Then, there is a sequence of instructions that handles any additional
|
||||
elements at the end of the array. (These instructions don't execute if
|
||||
there aren't any left-over values to process, but they do lengthen the
|
||||
amount of generated code.)
|
||||
|
||||
::
|
||||
|
||||
## BB#4: ## %partial_inner_only
|
||||
vmovd %eax, %xmm0
|
||||
vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
||||
vpermilps $0, %ymm0, %ymm0 ## ymm0 = ymm0[0,0,0,0,4,4,4,4]
|
||||
vextractf128 $1, %ymm0, %xmm3
|
||||
vmovd %esi, %xmm2
|
||||
vmovaps LCPI0_1(%rip), %ymm1
|
||||
vextractf128 $1, %ymm1, %xmm4
|
||||
vpaddd %xmm4, %xmm3, %xmm3
|
||||
# ....
|
||||
vmulps LCPI0_0(%rip), %ymm1, %ymm1
|
||||
vmaskmovps %ymm1, %ymm0, (%rdi,%rax)
|
||||
|
||||
|
||||
If you know that the number of elements to be processed will always be an
|
||||
exact multiple of 8, 16, etc., then adding a simple assignment to
|
||||
``count`` like the one below gives the compiler enough information to be
|
||||
able to eliminate the code for the additional array elements.
|
||||
|
||||
::
|
||||
|
||||
void foo(uniform float a[], uniform int count) {
|
||||
// This assignment doesn't change the value of count
|
||||
// if it's a multiple of 16, but it gives the compiler
|
||||
// insight into this fact, allowing for simpler code to
|
||||
// be generated for the foreach loop.
|
||||
count = (count & ~(16-1));
|
||||
foreach (i = 0 ... count)
|
||||
a[i] *= 2;
|
||||
}
|
||||
|
||||
With this new version of ``foo()``, only the code for the first loop above
|
||||
is generated.
|
||||
|
||||
|
||||
How do I launch an individual task for each active program instance?
|
||||
--------------------------------------------------------------------
|
||||
|
||||
Recall from the `discussion of "launch" in the ispc User's Guide`_ that a
|
||||
``launch`` statement launches a single task corresponding to a single gang
|
||||
of executing program instances, where the indices of the active program
|
||||
instances are the same as were active when the ``launch`` statement
|
||||
executed.
|
||||
|
||||
.. _discussion of "launch" in the ispc User's Guide: ispc.html#task-parallelism-launch-and-sync-statements
|
||||
|
||||
In some situations, it's desirable to be able to launch an individual task
|
||||
for each executing program instance. For example, we might be performing
|
||||
an iterative computation where a subset of the program instances determine
|
||||
that an item they are responsible for requires additional processing.
|
||||
|
||||
::
|
||||
|
||||
bool itemNeedsMoreProcessing(int);
|
||||
int itemNum = ...;
|
||||
if (itemNeedsMoreProcessing(itemNum)) {
|
||||
// do additional work
|
||||
}
|
||||
|
||||
For performance reasons, it may be desirable to apply an entire gang's
|
||||
worth of computation to each item that needs additional processing;
|
||||
there may be available parallelism in this computation such that we'd like
|
||||
to process each of the items with SPMD computation.
|
||||
|
||||
In this case, the ``foreach_active`` and ``unmasked`` constructs can be
|
||||
applied together to accomplish this goal.
|
||||
|
||||
::
|
||||
|
||||
// do additional work
|
||||
task void doWork(uniform int index);
|
||||
foreach_active (index) {
|
||||
unmasked {
|
||||
launch doWork(extract(itemNum, index));
|
||||
}
|
||||
}
|
||||
|
||||
Recall that the body of the ``foreach_active`` loop runs once for each
|
||||
active program instance, with each active program instance's
|
||||
``programIndex`` value available in ``index`` in the above. In the loop,
|
||||
we can re-establish an "all on" execution mask, enabling execution in all
|
||||
of the program instances in the gang, such that execution in ``doWork()``
|
||||
starts with all instances running. (Alternatively, the ``unmasked`` block
|
||||
could be in the definition of ``doWork()``.)
|
||||
|
||||
2155
docs/ispc.txt → docs/ispc.rst
Normal file → Executable file
2155
docs/ispc.txt → docs/ispc.rst
Normal file → Executable file
File diff suppressed because it is too large
Load Diff
79
docs/news.rst
Normal file
79
docs/news.rst
Normal file
@@ -0,0 +1,79 @@
|
||||
=========
|
||||
ispc News
|
||||
=========
|
||||
|
||||
ispc 1.4.1 is Released
|
||||
----------------------
|
||||
|
||||
A major new version of ``ispc`` has been released with stability and
|
||||
performance improvements on all supported platforms (Windows, Linux and MacOS).
|
||||
This version supports LLVM 3.1, 3.2, 3.3 and 3.4. The released binaries are
|
||||
built with 3.2.
|
||||
|
||||
ispc 1.3.0 is Released
|
||||
----------------------
|
||||
|
||||
A major new version of ``ispc`` has been released. In addition to a number
|
||||
of new language features, this release notably features initial support for
|
||||
compiling to the Intel Xeon Phi (Many Integrated Core) architecture.
|
||||
|
||||
ispc 1.2.1 is Released
|
||||
----------------------
|
||||
|
||||
This is a bugfix release, fixing approximately 20 bugs in the system and
|
||||
improving error handling and error reporting. New functionality includes
|
||||
very efficient float/half conversion routines thanks to Fabian
|
||||
Giesen. See the `1.2.1 release notes`_ for details.
|
||||
|
||||
.. _1.2.1 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
|
||||
|
||||
ispc 1.2.0 is Released
|
||||
-----------------------
|
||||
|
||||
A new major release was posted on March 20, 2012. This release includes
|
||||
significant new functionality for cleanly handling "structure of arrays"
|
||||
(SoA) data layout and a new model for how uniform and varying are handled
|
||||
with structure types.
|
||||
|
||||
Paper on ispc To Appear in InPar 2012
|
||||
-------------------------------------
|
||||
|
||||
A technical paper on ``ispc``, `ispc: A SPMD Compiler for High-Performance
|
||||
CPU Programming`_, by Matt Pharr and William R. Mark, has been accepted to
|
||||
the `InPar 2012`_ conference. This paper describes a number of the design
|
||||
features and key characteristics of the ``ispc`` implementation.
|
||||
|
||||
(© 2012 IEEE. Personal use of this material is permitted. Permission from
|
||||
IEEE must be obtained for all other uses, in any current or future media,
|
||||
including reprinting/republishing this material for advertising or
|
||||
promotional purposes, creating new collective works, for resale or
|
||||
redistribution to servers or lists, or reuse of any copyrighted component
|
||||
of this work in other works.).
|
||||
|
||||
.. _ispc\: A SPMD Compiler for High-Performance CPU Programming: https://github.com/downloads/ispc/ispc/ispc_inpar_2012.pdf
|
||||
.. _InPar 2012: http://innovativeparallel.org/
|
||||
|
||||
ispc 1.1.4 is Released
|
||||
----------------------
|
||||
|
||||
On February 4, 2012, the 1.1.4 release of ``ispc`` was posted; new features
|
||||
include ``new`` and ``delete`` for dynamic memory allocation in ``ispc``
|
||||
programs, "local" atomic operations in the standard library, and a new
|
||||
scalar compilation target. See the `1.1.4 release notes`_ for details.
|
||||
|
||||
.. _1.1.4 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
|
||||
|
||||
|
||||
ispc 1.1.3 is Released
|
||||
----------------------
|
||||
|
||||
With this release, the language now supports "switch" statements, with the same semantics and syntax as in C.
|
||||
|
||||
This release includes fixes for two important performance related issues:
|
||||
the quality of code generated for "foreach" statements has been
|
||||
substantially improved, and a performance regression with code for "gathers"
|
||||
that was introduced in v1.1.2 has been fixed in this release.
|
||||
|
||||
Thanks to Jean-Luc Duprat for a number of patches that improve support for
|
||||
building on various platforms, and to Pierre-Antoine Lacaze for patches so
|
||||
that ispc builds under MinGW.
|
||||
@@ -22,8 +22,8 @@ also included in the ``examples/`` directory.)
|
||||
- ``ispc``, 1 core
|
||||
- ``ispc``, 4 cores
|
||||
* - `AOBench`_ (512 x 512 resolution)
|
||||
- 3.99x
|
||||
- 19.32x
|
||||
- 6.19x
|
||||
- 28.06x
|
||||
* - `Binomial Options`_ (128k options)
|
||||
- 7.94x
|
||||
- 33.43x
|
||||
@@ -31,23 +31,23 @@ also included in the ``examples/`` directory.)
|
||||
- 8.45x
|
||||
- 32.48x
|
||||
* - `Deferred Shading`_ (1280p)
|
||||
- n/a
|
||||
- 5.02x
|
||||
- 23.06x
|
||||
* - `Mandelbrot Set`_
|
||||
- 6.21x
|
||||
- 19.90x
|
||||
- 20.28x
|
||||
* - `Perlin Noise Function`_
|
||||
- 5.37x
|
||||
- n/a
|
||||
* - `Ray Tracer`_ (Sponza dataset)
|
||||
- 3.99x
|
||||
- 19.32x
|
||||
- 4.31x
|
||||
- 20.29x
|
||||
* - `3D Stencil`_
|
||||
- 3.76x
|
||||
- 13.79x
|
||||
- 4.05x
|
||||
- 15.53x
|
||||
* - `Volume Rendering`_
|
||||
- 3.11x
|
||||
- 15.80x
|
||||
- 3.60x
|
||||
- 17.53x
|
||||
|
||||
|
||||
.. _AOBench: https://github.com/ispc/ispc/tree/master/examples/aobench
|
||||
@@ -13,6 +13,7 @@ the most out of ``ispc`` in practice.
|
||||
+ `Improving Control Flow Coherence With "foreach_tiled"`_
|
||||
+ `Using Coherent Control Flow Constructs`_
|
||||
+ `Use "uniform" Whenever Appropriate`_
|
||||
+ `Use "Structure of Arrays" Layout When Possible`_
|
||||
|
||||
* `Tips and Techniques`_
|
||||
|
||||
@@ -20,6 +21,7 @@ the most out of ``ispc`` in practice.
|
||||
+ `Avoid 64-bit Addressing Calculations When Possible`_
|
||||
+ `Avoid Computation With 8 and 16-bit Integer Types`_
|
||||
+ `Implementing Reductions Efficiently`_
|
||||
+ `Using "foreach_active" Effectively`_
|
||||
+ `Using Low-level Vector Tricks`_
|
||||
+ `The "Fast math" Option`_
|
||||
+ `"inline" Aggressively`_
|
||||
@@ -64,7 +66,7 @@ on each one:
|
||||
Depending on the specifics of the computation being performed, the code
|
||||
generated for this function could likely be improved by modifying the code
|
||||
so that the loop only goes as far through the data as is possible to pack
|
||||
an entire gang of program instances with computation each time thorugh the
|
||||
an entire gang of program instances with computation each time through the
|
||||
loop. Doing so enables the ``ispc`` compiler to generate more efficient
|
||||
code for cases where it knows that the execution mask is "all on". Then,
|
||||
an ``if`` statement at the end handles processing the ragged extra bits of
|
||||
@@ -153,7 +155,7 @@ processed, and so forth.
|
||||
|
||||
Performance benefit can come from using ``foreach_tiled`` in that it
|
||||
essentially optimizes for the benefit of iterating over *compact* regions
|
||||
of the domian (while ``foreach`` iterates over the domain in a way that
|
||||
of the domain (while ``foreach`` iterates over the domain in a way that
|
||||
generally allows linear memory access.) There are two benefits from
|
||||
processing compact regions of the domain.
|
||||
|
||||
@@ -215,7 +217,7 @@ Use "uniform" Whenever Appropriate
|
||||
----------------------------------
|
||||
|
||||
For any variable that will always have the same value across all of the
|
||||
program instances in a gang, declare the variable with the ``unfiorm``
|
||||
program instances in a gang, declare the variable with the ``uniform``
|
||||
qualifier. Doing so enables the ``ispc`` compiler to emit better code in
|
||||
many different ways.
|
||||
|
||||
@@ -229,7 +231,7 @@ number of iterations:
|
||||
|
||||
If this is written with ``i`` as a ``varying`` variable, as above, there's
|
||||
additional overhead in the code generated for the loop as the compiler
|
||||
emits instructions to handle the possibilty of not all program instances
|
||||
emits instructions to handle the possibility of not all program instances
|
||||
following the same control flow path (as might be the case if the loop
|
||||
limit, 10, was itself a ``varying`` value.)
|
||||
|
||||
@@ -247,6 +249,76 @@ but it's always best to provide the compiler with as much help as possible
|
||||
to understand the actual form of your computation.
|
||||
|
||||
|
||||
Use "Structure of Arrays" Layout When Possible
|
||||
----------------------------------------------
|
||||
|
||||
In general, memory access performance (for both reads and writes) is best
|
||||
when the running program instances access a contiguous region of memory; in
|
||||
this case efficient vector load and store instructions can often be used
|
||||
rather than gathers and scatters. As an example of this issue, consider an
|
||||
array of a simple point datatype laid out and accessed in conventional
|
||||
"array of structures" (AOS) layout:
|
||||
|
||||
::
|
||||
|
||||
struct Point { float x, y, z; };
|
||||
uniform Point pts[...];
|
||||
float v = pts[programIndex].x;
|
||||
|
||||
In the above code, the access to ``pts[programIndex].x`` accesses
|
||||
non-sequential memory locations, due to the ``y`` and ``z`` values between
|
||||
the desired ``x`` values in memory. A "gather" is required to get the
|
||||
value of ``v``, with a corresponding decrease in performance.
|
||||
|
||||
If ``Point`` was defined as a "structure of arrays" (SOA) type, the access
|
||||
can be much more efficient:
|
||||
|
||||
::
|
||||
|
||||
struct Point8 { float x[8], y[8], z[8]; };
|
||||
uniform Point8 pts8[...];
|
||||
int majorIndex = programIndex / 8;
|
||||
int minorIndex = programIndex % 8;
|
||||
float v = pts8[majorIndex].x[minorIndex];
|
||||
|
||||
In this case, each ``Point8`` has 8 ``x`` values contiguous in memory
|
||||
before 8 ``y`` values and then 8 ``z`` values. If the gang size is 8 or
|
||||
less, the access for ``v`` will have the same value of ``majorIndex`` for
|
||||
all program instances and will access consecutive elements of the ``x[8]``
|
||||
array with a vector load. (For larger gang sizes, two 8-wide vector loads
|
||||
would be issued, which is also quite efficient.)
|
||||
|
||||
However, the syntax in the above code is messy; accessing SOA data in this
|
||||
fashion is much less elegant than the corresponding code for accessing the
|
||||
data with AOS layout. The ``soa`` qualifier in ``ispc`` can be used to
|
||||
cause the corresponding transformation to be made to the ``Point`` type,
|
||||
while preserving the clean syntax for data access that comes with AOS
|
||||
layout:
|
||||
|
||||
::
|
||||
|
||||
soa<8> Point pts[...];
|
||||
float v = pts[programIndex].x;
|
||||
|
||||
Thanks to having SOA layout as a first-class concept in the language's type
|
||||
system, it's easy to write functions that convert data between the
|
||||
layouts. For example, the ``aos_to_soa`` function below converts ``count``
|
||||
elements of the given ``Point`` type from AOS to 8-wide SOA layout. (It
|
||||
assumes that the caller has pre-allocated sufficient space in the
|
||||
``pts_soa`` output array.)
|
||||
|
||||
::
|
||||
|
||||
void aos_to_soa(uniform Point pts_aos[], uniform int count,
|
||||
soa<8> Point pts_soa[]) {
|
||||
foreach (i = 0 ... count)
|
||||
pts_soa[i] = pts_aos[i];
|
||||
}
|
||||
|
||||
Analogously, a function could be written to convert back from SOA to AOS if
|
||||
needed.
|
||||
|
||||
|
||||
Tips and Techniques
|
||||
===================
|
||||
|
||||
@@ -339,6 +411,12 @@ based on the index, it can be worth doing. See the example
|
||||
``examples/volume_rendering`` in the ``ispc`` distribution for the use of
|
||||
this technique in an instance where it is beneficial to performance.
|
||||
|
||||
Understanding Memory Read Coalescing
|
||||
------------------------------------
|
||||
|
||||
XXXX todo
|
||||
|
||||
|
||||
Avoid 64-bit Addressing Calculations When Possible
|
||||
--------------------------------------------------
|
||||
|
||||
@@ -433,6 +511,43 @@ values--very efficient code in the end.
|
||||
return reduce_add(sum);
|
||||
}
|
||||
|
||||
Using "foreach_active" Effectively
|
||||
----------------------------------
|
||||
|
||||
For high-performance code,
|
||||
|
||||
For example, consider this segment of code, from the introduction of
|
||||
``foreach_active`` in the ispc User's Guide:
|
||||
|
||||
::
|
||||
|
||||
uniform float array[...] = { ... };
|
||||
int index = ...;
|
||||
foreach_active (i) {
|
||||
++array[index];
|
||||
}
|
||||
|
||||
Here, ``index`` was assumed to possibly have the same value for multiple
|
||||
program instances, so the updates to ``array[index]`` are serialized by the
|
||||
``foreach_active`` statement in order to not have undefined results when
|
||||
``index`` values do collide.
|
||||
|
||||
The code generated by the compiler can be improved in this case by making
|
||||
it clear that only a single element of the array is accessed by
|
||||
``array[index]`` and that thus a general gather or scatter isn't required.
|
||||
Specifically, by using the ``extract()`` function from the standard library
|
||||
to extract the current program instance's value of ``index`` into a
|
||||
``uniform`` variable and then using that to index into ``array``, as below,
|
||||
more efficient code is generated.
|
||||
|
||||
::
|
||||
|
||||
foreach_active (instanceNum) {
|
||||
uniform int unifIndex = extract(index, instanceNum);
|
||||
++array[unifIndex];
|
||||
}
|
||||
|
||||
|
||||
Using Low-level Vector Tricks
|
||||
-----------------------------
|
||||
|
||||
@@ -547,7 +662,7 @@ gathers happen.)
|
||||
|
||||
extern "C" {
|
||||
void ISPCInstrument(const char *fn, const char *note,
|
||||
int line, int mask);
|
||||
int line, uint64_t mask);
|
||||
}
|
||||
|
||||
This function is passed the file name of the ``ispc`` file running, a short
|
||||
@@ -560,7 +675,7 @@ as follows:
|
||||
|
||||
::
|
||||
|
||||
ISPCInstrument("foo.ispc", "function entry", 55, 0xf);
|
||||
ISPCInstrument("foo.ispc", "function entry", 55, 0xfull);
|
||||
|
||||
This call indicates that the currently executing program has just
|
||||
entered the function defined at line 55 of the file ``foo.ispc``, with a
|
||||
@@ -568,7 +683,7 @@ mask of all lanes currently executing (assuming a four-wide gang size
|
||||
target machine).
|
||||
|
||||
For a fuller example of the utility of this functionality, see
|
||||
``examples/aobench_instrumented`` in the ``ispc`` distribution. Ths
|
||||
``examples/aobench_instrumented`` in the ``ispc`` distribution. This
|
||||
example includes an implementation of the ``ISPCInstrument()`` function
|
||||
that collects aggregate data about the program's execution behavior.
|
||||
|
||||
66
docs/template-news.txt
Normal file
66
docs/template-news.txt
Normal file
@@ -0,0 +1,66 @@
|
||||
%(head_prefix)s
|
||||
%(head)s
|
||||
<script type="text/javascript">
|
||||
|
||||
var _gaq = _gaq || [];
|
||||
_gaq.push(['_setAccount', 'UA-1486404-4']);
|
||||
_gaq.push(['_trackPageview']);
|
||||
|
||||
(function() {
|
||||
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
||||
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
||||
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
||||
})();
|
||||
|
||||
</script>
|
||||
%(stylesheet)s
|
||||
%(body_prefix)s
|
||||
<div id="wrap">
|
||||
<div id="wrap2">
|
||||
<div id="header">
|
||||
<h1 id="logo">Intel SPMD Program Compiler</h1>
|
||||
<div id="slogan">An open-source compiler for high-performance SIMD programming on
|
||||
the CPU</div>
|
||||
</div>
|
||||
<div id="nav">
|
||||
<div id="nbar">
|
||||
<ul>
|
||||
<li><a href="index.html">Overview</a></li>
|
||||
<li id="selected"><a href="news.html">News</a></li>
|
||||
<li><a href="features.html">Features</a></li>
|
||||
<li><a href="downloads.html">Downloads</a></li>
|
||||
<li><a href="documentation.html">Documentation</a></li>
|
||||
<li><a href="perf.html">Performance</a></li>
|
||||
<li><a href="contrib.html">Contributors</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
<div id="content-wrap">
|
||||
<div id="sidebar">
|
||||
<div class="widgetspace">
|
||||
<h1>Resources</h1>
|
||||
<ul class="menu">
|
||||
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-users/">ispc
|
||||
users mailing list</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
|
||||
developers mailing list</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
%(body_pre_docinfo)s
|
||||
%(docinfo)s
|
||||
<div id="content">
|
||||
%(body)s
|
||||
</div>
|
||||
<div class="clearfix"></div>
|
||||
<div id="footer"> © 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<!-- Please Do Not remove this link, thank u -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
%(body_suffix)s
|
||||
@@ -26,10 +26,12 @@
|
||||
<div id="nbar">
|
||||
<ul>
|
||||
<li><a href="index.html">Overview</a></li>
|
||||
<li><a href="news.html">News</a></li>
|
||||
<li><a href="features.html">Features</a></li>
|
||||
<li><a href="downloads.html">Downloads</a></li>
|
||||
<li><a href="documentation.html">Documentation</a></li>
|
||||
<li id="selected"><a href="perf.html">Performance</a></li>
|
||||
<li><a href="contrib.html">Contributors</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
@@ -45,8 +47,7 @@
|
||||
developers mailing list</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen documentation of
|
||||
<tt>ispc</tt> source code</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
@@ -56,7 +57,7 @@
|
||||
%(body)s
|
||||
</div>
|
||||
<div class="clearfix"></div>
|
||||
<div id="footer"> © 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<div id="footer"> © 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<!-- Please Do Not remove this link, thank u -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -26,10 +26,12 @@
|
||||
<div id="nbar">
|
||||
<ul>
|
||||
<li><a href="index.html">Overview</a></li>
|
||||
<li><a href="news.html">News</a></li>
|
||||
<li><a href="features.html">Features</a></li>
|
||||
<li><a href="downloads.html">Downloads</a></li>
|
||||
<li id="selected"><a href="documentation.html">Documentation</a></li>
|
||||
<li><a href="perf.html">Performance</a></li>
|
||||
<li><a href="contrib.html">Contributors</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
@@ -45,8 +47,7 @@
|
||||
developers mailing list</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen documentation of
|
||||
<tt>ispc</tt> source code</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
@@ -56,7 +57,7 @@
|
||||
%(body)s
|
||||
</div>
|
||||
<div class="clearfix"></div>
|
||||
<div id="footer"> © 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<div id="footer"> © 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<!-- Please Do Not remove this link, thank u -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
11
doxygen.cfg
11
doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
|
||||
# This could be handy for archiving the generated documentation or
|
||||
# if some version control system is used.
|
||||
|
||||
PROJECT_NUMBER = 1.1.0
|
||||
PROJECT_NUMBER = 1.4.1
|
||||
|
||||
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
|
||||
# base path where the generated documentation will be put.
|
||||
@@ -581,10 +581,12 @@ WARN_LOGFILE =
|
||||
# directories like "/usr/src/myproject". Separate the files or directories
|
||||
# with spaces.
|
||||
|
||||
INPUT = builtins.h \
|
||||
INPUT = ast.h \
|
||||
builtins.h \
|
||||
ctx.h \
|
||||
decl.h \
|
||||
expr.h \
|
||||
func.h \
|
||||
ispc.h \
|
||||
llvmutil.h \
|
||||
module.h \
|
||||
@@ -593,10 +595,13 @@ INPUT = builtins.h \
|
||||
sym.h \
|
||||
type.h \
|
||||
util.h \
|
||||
ast.cpp \
|
||||
builtins.cpp \
|
||||
cbackend.cpp \
|
||||
ctx.cpp \
|
||||
decl.cpp \
|
||||
expr.cpp \
|
||||
func.cpp \
|
||||
ispc.cpp \
|
||||
llvmutil.cpp \
|
||||
main.cpp \
|
||||
@@ -608,7 +613,7 @@ INPUT = builtins.h \
|
||||
util.cpp \
|
||||
parse.yy \
|
||||
lex.ll \
|
||||
builtins-c.c
|
||||
builtins/builtins.c
|
||||
|
||||
# This tag can be used to specify the character encoding of the source files
|
||||
# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
|
||||
|
||||
@@ -39,9 +39,6 @@ example implementation of this function that counts the number of times the
|
||||
callback is made and records some statistics about control flow coherence
|
||||
is provided in the instrument.cpp file.
|
||||
|
||||
*** Note: on Linux, this example currently hits an assertion in LLVM during
|
||||
*** compilation
|
||||
|
||||
|
||||
Deferred
|
||||
========
|
||||
@@ -76,6 +73,14 @@ This directory includes three implementations of the algorithm:
|
||||
light culling and shading.
|
||||
|
||||
|
||||
GMRES
|
||||
=====
|
||||
|
||||
An implementation of the generalized minimal residual method for solving
|
||||
sparse matrix equations.
|
||||
(http://en.wikipedia.org/wiki/Generalized_minimal_residual_method)
|
||||
|
||||
|
||||
Mandelbrot
|
||||
==========
|
||||
|
||||
@@ -110,6 +115,13 @@ This program implements both the Black-Scholes and Binomial options pricing
|
||||
models in both ispc and regular serial C++ code.
|
||||
|
||||
|
||||
Perfbench
|
||||
=========
|
||||
|
||||
This runs a number of microbenchmarks to measure system performance and
|
||||
code generation quality.
|
||||
|
||||
|
||||
RT
|
||||
==
|
||||
|
||||
|
||||
@@ -1,39 +1,7 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
EXAMPLE=ao
|
||||
CPP_SRC=ao.cpp ao_serial.cpp
|
||||
ISPC_SRC=ao.ispc
|
||||
ISPC_TARGETS=sse2,sse4,avx
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4,avx --arch=x86-64
|
||||
|
||||
ISPC_OBJS=objs/ao_ispc.o objs/ao_ispc_sse2.o objs/ao_ispc_sse4.o \
|
||||
objs/ao_ispc_avx.o
|
||||
OBJS=objs/ao.o objs/ao_serial.o $(ISPC_OBJS) $(TASK_OBJ)
|
||||
|
||||
default: ao
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ ao
|
||||
|
||||
ao: dirs $(OBJS) $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/ao.o: objs/ao_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
include ../common.mk
|
||||
|
||||
@@ -50,7 +50,6 @@ struct Isect {
|
||||
struct Sphere {
|
||||
vec center;
|
||||
float radius;
|
||||
|
||||
};
|
||||
|
||||
struct Plane {
|
||||
@@ -82,8 +81,8 @@ static inline void vnormalize(vec &v) {
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
||||
static void
|
||||
ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) {
|
||||
float d = -dot(plane.p, plane.n);
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
@@ -103,7 +102,7 @@ ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
||||
|
||||
|
||||
static inline void
|
||||
ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
|
||||
ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) {
|
||||
vec rs = ray.org - sphere.center;
|
||||
|
||||
float B = dot(rs, ray.dir);
|
||||
@@ -124,7 +123,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
static void
|
||||
orthoBasis(vec basis[3], vec n) {
|
||||
basis[2] = n;
|
||||
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
||||
@@ -147,8 +146,8 @@ orthoBasis(vec basis[3], vec n) {
|
||||
}
|
||||
|
||||
|
||||
static inline float
|
||||
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
||||
static float
|
||||
ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3],
|
||||
RNGState &rngstate) {
|
||||
float eps = 0.0001f;
|
||||
vec p, n;
|
||||
@@ -204,112 +203,52 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
||||
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
static Sphere spheres[3] = {
|
||||
static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
static uniform Sphere spheres[3] = {
|
||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
|
||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||
RNGState rngstate;
|
||||
|
||||
seed_rng(&rngstate, y0);
|
||||
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
|
||||
float invSamples = 1.f / nsubsamples;
|
||||
|
||||
// Compute the mapping between the 'programCount'-wide program
|
||||
// instances running in parallel and samples in the image.
|
||||
//
|
||||
// For now, we'll always take four samples per pixel, so start by
|
||||
// initializing du and dv with offsets into subpixel samples. We'll
|
||||
// take care of further updating du and dv for the case where we're
|
||||
// doing more than 4 program instances in parallel shortly.
|
||||
uniform float uSteps[4] = { 0, 1, 0, 1 };
|
||||
uniform float vSteps[4] = { 0, 0, 1, 1 };
|
||||
float du = uSteps[programIndex % 4] / nsubsamples;
|
||||
float dv = vSteps[programIndex % 4] / nsubsamples;
|
||||
foreach_tiled(y = y0 ... y1, x = 0 ... w,
|
||||
u = 0 ... nsubsamples, v = 0 ... nsubsamples) {
|
||||
float du = (float)u * invSamples, dv = (float)v * invSamples;
|
||||
|
||||
// Now handle the case where we are able to do more than one pixel's
|
||||
// worth of work at once. nx records the number of pixels in the x
|
||||
// direction we do per iteration and ny the number in y.
|
||||
uniform int nx = 1, ny = 1;
|
||||
// Figure out x,y pixel in NDC
|
||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||
float ret = 0.f;
|
||||
Ray ray;
|
||||
Isect isect;
|
||||
|
||||
// FIXME: We actually need ny to be 1 regardless of the decomposition,
|
||||
// since the task decomposition is one scanline high.
|
||||
ray.org = 0.f;
|
||||
|
||||
if (programCount == 8) {
|
||||
// Do two pixels at once in the x direction
|
||||
nx = 2;
|
||||
if (programIndex >= 4)
|
||||
// And shift the offsets for the second pixel's worth of work
|
||||
++du;
|
||||
}
|
||||
else if (programCount == 16) {
|
||||
nx = 4;
|
||||
ny = 1;
|
||||
if (programIndex >= 4 && programIndex < 8)
|
||||
++du;
|
||||
if (programIndex >= 8 && programIndex < 12)
|
||||
du += 2;
|
||||
if (programIndex >= 12)
|
||||
du += 3;
|
||||
}
|
||||
// Poor man's perspective projection
|
||||
ray.dir.x = px;
|
||||
ray.dir.y = py;
|
||||
ray.dir.z = -1.0;
|
||||
vnormalize(ray.dir);
|
||||
|
||||
// Now loop over all of the pixels, stepping in x and y as calculated
|
||||
// above. (Assumes that ny divides y and nx divides x...)
|
||||
for (uniform int y = y0; y < y1; y += ny) {
|
||||
for (uniform int x = 0; x < w; x += nx) {
|
||||
// Figure out x,y pixel in NDC
|
||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||
float ret = 0.f;
|
||||
Ray ray;
|
||||
Isect isect;
|
||||
isect.t = 1.0e+17;
|
||||
isect.hit = 0;
|
||||
|
||||
ray.org = 0.f;
|
||||
for (uniform int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(isect, ray, spheres[snum]);
|
||||
ray_plane_intersect(isect, ray, plane);
|
||||
|
||||
// Poor man's perspective projection
|
||||
ray.dir.x = px;
|
||||
ray.dir.y = py;
|
||||
ray.dir.z = -1.0;
|
||||
vnormalize(ray.dir);
|
||||
// Note use of 'coherent' if statement; the set of rays we
|
||||
// trace will often all hit or all miss the scene
|
||||
cif (isect.hit) {
|
||||
ret = ambient_occlusion(isect, plane, spheres, rngstate);
|
||||
ret *= invSamples * invSamples;
|
||||
|
||||
isect.t = 1.0e+17;
|
||||
isect.hit = 0;
|
||||
|
||||
for (uniform int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(isect, ray, spheres[snum]);
|
||||
ray_plane_intersect(isect, ray, plane);
|
||||
|
||||
// Note use of 'coherent' if statement; the set of rays we
|
||||
// trace will often all hit or all miss the scene
|
||||
cif (isect.hit)
|
||||
ret = ambient_occlusion(isect, plane, spheres, rngstate);
|
||||
|
||||
// This is a little grungy; we have results for
|
||||
// programCount-worth of values. Because we're doing 2x2
|
||||
// subsamples, we need to peel them off in groups of four,
|
||||
// average the four values for each pixel, and update the
|
||||
// output image.
|
||||
//
|
||||
// Store the varying value to a uniform array of the same size.
|
||||
// See the discussion about communication among program
|
||||
// instances in the ispc user's manual for more discussion on
|
||||
// this idiom.
|
||||
uniform float retArray[programCount];
|
||||
retArray[programIndex] = ret;
|
||||
|
||||
// offset to the first pixel in the image
|
||||
uniform int offset = 3 * (y * w + x);
|
||||
for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
|
||||
// Get the four sample values for this pixel
|
||||
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
|
||||
retArray[p+3];
|
||||
|
||||
// Normalize by number of samples taken
|
||||
sumret /= nsubsamples * nsubsamples;
|
||||
|
||||
// Store result in the image
|
||||
image[offset+0] = sumret;
|
||||
image[offset+1] = sumret;
|
||||
image[offset+2] = sumret;
|
||||
}
|
||||
int offset = 3 * (y * w + x);
|
||||
atomic_add_local(&image[offset], ret);
|
||||
atomic_add_local(&image[offset+1], ret);
|
||||
atomic_add_local(&image[offset+2], ret);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -329,5 +268,5 @@ static void task ao_task(uniform int width, uniform int height,
|
||||
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
launch[h] < ao_task(w, h, nsubsamples, image) >;
|
||||
launch[h] ao_task(w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
@@ -14,13 +14,13 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ ao
|
||||
|
||||
ao: dirs objs/ao.o objs/instrument.o objs/ao_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/instrument.o -lm -lpthread
|
||||
ao: objs/ao.o objs/instrument.o objs/ao_ispc.o ../tasksys.cpp
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ -lm -lpthread
|
||||
|
||||
objs/%.o: %.cpp
|
||||
objs/%.o: %.cpp dirs
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/ao.o: objs/ao_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc dirs
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_instrumented_ispc.h
|
||||
|
||||
@@ -211,7 +211,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||
RNGState rngstate;
|
||||
|
||||
seed_rng(&rngstate, y0);
|
||||
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
|
||||
|
||||
// Compute the mapping between the 'programCount'-wide program
|
||||
// instances running in parallel and samples in the image.
|
||||
@@ -329,5 +329,5 @@ static void task ao_task(uniform int width, uniform int height,
|
||||
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
launch[h] < ao_task(w, h, nsubsamples, image) >;
|
||||
launch[h] ao_task(w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
74
examples/common.mk
Normal file
74
examples/common.mk
Normal file
@@ -0,0 +1,74 @@
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=objs/tasksys.o
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O2 -m64
|
||||
CC=gcc
|
||||
CCFLAGS=-Iobjs/ -O2 -m64
|
||||
|
||||
LIBS=-lm $(TASK_LIB) -lstdc++
|
||||
ISPC=ispc -O2 --arch=x86-64 $(ISPC_FLAGS)
|
||||
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
|
||||
$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
|
||||
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
|
||||
|
||||
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o))
|
||||
CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o))
|
||||
OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS)
|
||||
|
||||
default: $(EXAMPLE)
|
||||
|
||||
all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
objs/%.cpp objs/%.o objs/%.h: dirs
|
||||
|
||||
# Remove the object directory, editor backups, and every executable that the
# 'default' and 'all' targets can produce (including the -scalar variant).
clean:
	/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
|
||||
|
||||
$(EXAMPLE): $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
|
||||
|
||||
objs/%.o: %.cpp dirs $(ISPC_HEADER)
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: %.c dirs $(ISPC_HEADER)
|
||||
$(CC) $< $(CCFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp dirs
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC)
|
||||
$(ISPC) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp
|
||||
$(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
$(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC)
|
||||
$(ISPC) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp
|
||||
$(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC)
|
||||
$(ISPC) $< -o $@ --target=generic-1
|
||||
|
||||
$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
|
||||
@@ -1,38 +1,8 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
EXAMPLE=deferred_shading
|
||||
CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp
|
||||
ISPC_SRC=kernels.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_FLAGS=--opt=fast-math
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64 --math-lib=fast
|
||||
|
||||
OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/kernels_ispc_sse2.o \
|
||||
objs/kernels_ispc_sse4.o objs/kernels_ispc_avx.o \
|
||||
objs/dynamic_c.o objs/dynamic_cilk.o
|
||||
|
||||
default: deferred_shading
|
||||
|
||||
.PHONY: dirs clean
|
||||
.PRECIOUS: objs/kernels_ispc.h
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ deferred_shading
|
||||
|
||||
deferred_shading: dirs $(OBJS) $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
include ../common.mk
|
||||
|
||||
@@ -204,6 +204,7 @@ void WriteFrame(const char *filename, const InputData *input,
|
||||
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
fwrite(framebufferAOS, imageBytes, 1, out);
|
||||
fclose(out);
|
||||
|
||||
lAlignedFree(framebufferAOS);
|
||||
}
|
||||
|
||||
@@ -35,35 +35,35 @@
|
||||
|
||||
struct InputDataArrays
|
||||
{
|
||||
uniform float * uniform zBuffer;
|
||||
uniform unsigned int16 * uniform normalEncoded_x; // half float
|
||||
uniform unsigned int16 * uniform normalEncoded_y; // half float
|
||||
uniform unsigned int16 * uniform specularAmount; // half float
|
||||
uniform unsigned int16 * uniform specularPower; // half float
|
||||
uniform unsigned int8 * uniform albedo_x; // unorm8
|
||||
uniform unsigned int8 * uniform albedo_y; // unorm8
|
||||
uniform unsigned int8 * uniform albedo_z; // unorm8
|
||||
uniform float * uniform lightPositionView_x;
|
||||
uniform float * uniform lightPositionView_y;
|
||||
uniform float * uniform lightPositionView_z;
|
||||
uniform float * uniform lightAttenuationBegin;
|
||||
uniform float * uniform lightColor_x;
|
||||
uniform float * uniform lightColor_y;
|
||||
uniform float * uniform lightColor_z;
|
||||
uniform float * uniform lightAttenuationEnd;
|
||||
float *zBuffer;
|
||||
unsigned int16 *normalEncoded_x; // half float
|
||||
unsigned int16 *normalEncoded_y; // half float
|
||||
unsigned int16 *specularAmount; // half float
|
||||
unsigned int16 *specularPower; // half float
|
||||
unsigned int8 *albedo_x; // unorm8
|
||||
unsigned int8 *albedo_y; // unorm8
|
||||
unsigned int8 *albedo_z; // unorm8
|
||||
float *lightPositionView_x;
|
||||
float *lightPositionView_y;
|
||||
float *lightPositionView_z;
|
||||
float *lightAttenuationBegin;
|
||||
float *lightColor_x;
|
||||
float *lightColor_y;
|
||||
float *lightColor_z;
|
||||
float *lightAttenuationEnd;
|
||||
};
|
||||
|
||||
struct InputHeader
|
||||
{
|
||||
uniform float cameraProj[4][4];
|
||||
uniform float cameraNear;
|
||||
uniform float cameraFar;
|
||||
float cameraProj[4][4];
|
||||
float cameraNear;
|
||||
float cameraFar;
|
||||
|
||||
uniform int32 framebufferWidth;
|
||||
uniform int32 framebufferHeight;
|
||||
uniform int32 numLights;
|
||||
uniform int32 inputDataChunkSize;
|
||||
uniform int32 inputDataArrayOffsets[idaNum];
|
||||
int32 framebufferWidth;
|
||||
int32 framebufferHeight;
|
||||
int32 numLights;
|
||||
int32 inputDataChunkSize;
|
||||
int32 inputDataArrayOffsets[idaNum];
|
||||
};
|
||||
|
||||
|
||||
@@ -158,38 +158,22 @@ IntersectLightsWithTileMinMax(
|
||||
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
|
||||
|
||||
// Parallize across frustum planes.
|
||||
// We really only have four side planes here, but write the code to
|
||||
// handle programCount > 4 robustly
|
||||
uniform float frustumPlanes_xy[programCount];
|
||||
uniform float frustumPlanes_z[programCount];
|
||||
uniform float frustumPlanes_xy[4] = {
|
||||
-(cameraProj_11 * gBufferScale_x),
|
||||
(cameraProj_11 * gBufferScale_x),
|
||||
(cameraProj_22 * gBufferScale_y),
|
||||
-(cameraProj_22 * gBufferScale_y) };
|
||||
uniform float frustumPlanes_z[4] = {
|
||||
tileEndX - gBufferScale_x,
|
||||
-tileStartX + gBufferScale_x,
|
||||
tileEndY - gBufferScale_y,
|
||||
-tileStartY + gBufferScale_y };
|
||||
|
||||
// TODO: If programIndex < 4 here? Don't care about masking off the
|
||||
// rest but if interleaving ("x2" modes) the other lanes should ideally
|
||||
// not be emitted...
|
||||
{
|
||||
// This one is totally constant over the whole screen... worth pulling it up at all?
|
||||
float frustumPlanes_xy_v;
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_11 * gBufferScale_x));
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2, (cameraProj_22 * gBufferScale_y));
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));
|
||||
|
||||
float frustumPlanes_z_v;
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileEndX - gBufferScale_x);
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 2, tileEndY - gBufferScale_y);
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);
|
||||
|
||||
// Normalize
|
||||
float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
|
||||
frustumPlanes_z_v * frustumPlanes_z_v);
|
||||
frustumPlanes_xy_v *= norm;
|
||||
frustumPlanes_z_v *= norm;
|
||||
|
||||
// Save out for uniform use later
|
||||
frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
|
||||
frustumPlanes_z[programIndex] = frustumPlanes_z_v;
|
||||
for (uniform int i = 0; i < 4; ++i) {
|
||||
uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
|
||||
frustumPlanes_z[i] * frustumPlanes_z[i]);
|
||||
frustumPlanes_xy[i] *= norm;
|
||||
frustumPlanes_z[i] *= norm;
|
||||
}
|
||||
|
||||
uniform int32 tileNumLights = 0;
|
||||
@@ -343,8 +327,8 @@ ShadeTile(
|
||||
|
||||
// Reconstruct normal from G-buffer
|
||||
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
|
||||
float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
|
||||
|
||||
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||
float m = sqrt(4.0f * f - 1.0f);
|
||||
@@ -355,9 +339,9 @@ ShadeTile(
|
||||
|
||||
// Load other G-buffer parameters
|
||||
float surface_specularAmount =
|
||||
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
|
||||
half_to_float(inputData.specularAmount[gBufferOffset]);
|
||||
float surface_specularPower =
|
||||
half_to_float_fast(inputData.specularPower[gBufferOffset]);
|
||||
half_to_float(inputData.specularPower[gBufferOffset]);
|
||||
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||
@@ -530,9 +514,9 @@ RenderStatic(uniform InputHeader &inputHeader,
|
||||
|
||||
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
|
||||
// by MIN_TILE_HEIGHT pixels.
|
||||
launch[num_groups] < RenderTile(num_groups_x, num_groups_y,
|
||||
inputHeader, inputData, visualizeLightCount,
|
||||
framebuffer_r, framebuffer_g, framebuffer_b) >;
|
||||
launch[num_groups] RenderTile(num_groups_x, num_groups_y,
|
||||
inputHeader, inputData, visualizeLightCount,
|
||||
framebuffer_r, framebuffer_g, framebuffer_b);
|
||||
}
|
||||
|
||||
|
||||
@@ -591,8 +575,6 @@ SplitTileMinMax(
|
||||
uniform float light_positionView_z_array[],
|
||||
uniform float light_attenuationEnd_array[],
|
||||
// Outputs
|
||||
// TODO: ISPC doesn't currently like multidimensionsal arrays so we'll do the
|
||||
// indexing math ourselves
|
||||
uniform int32 subtileIndices[],
|
||||
uniform int32 subtileIndicesPitch,
|
||||
uniform int32 subtileNumLights[]
|
||||
@@ -601,30 +583,20 @@ SplitTileMinMax(
|
||||
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
|
||||
|
||||
// Parallize across frustum planes
|
||||
// Only have 2 frustum split planes here so may not be worth it, but
|
||||
// we'll do it for now for consistency
|
||||
uniform float frustumPlanes_xy[programCount];
|
||||
uniform float frustumPlanes_z[programCount];
|
||||
|
||||
// This one is totally constant over the whole screen... worth pulling it up at all?
|
||||
float frustumPlanes_xy_v;
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_22 * gBufferScale_y));
|
||||
|
||||
float frustumPlanes_z_v;
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
|
||||
uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
|
||||
(cameraProj_22 * gBufferScale_y) };
|
||||
uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
|
||||
tileMidY - gBufferScale_y };
|
||||
|
||||
// Normalize
|
||||
float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
|
||||
frustumPlanes_z_v * frustumPlanes_z_v);
|
||||
frustumPlanes_xy_v *= norm;
|
||||
frustumPlanes_z_v *= norm;
|
||||
|
||||
// Save out for uniform use later
|
||||
frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
|
||||
frustumPlanes_z[programIndex] = frustumPlanes_z_v;
|
||||
uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
|
||||
frustumPlanes_z[0] * frustumPlanes_z[0]),
|
||||
rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
|
||||
frustumPlanes_z[1] * frustumPlanes_z[1]) };
|
||||
frustumPlanes_xy[0] *= norm[0];
|
||||
frustumPlanes_xy[1] *= norm[1];
|
||||
frustumPlanes_z[0] *= norm[0];
|
||||
frustumPlanes_z[1] *= norm[1];
|
||||
|
||||
// Initialize
|
||||
uniform int32 subtileLightOffset[4];
|
||||
|
||||
@@ -87,7 +87,7 @@ int main(int argc, char** argv) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
ispc::RenderStatic(&input->header, &input->arrays,
|
||||
ispc::RenderStatic(input->header, input->arrays,
|
||||
VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer.r, framebuffer.g, framebuffer.b);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
|
||||
@@ -23,6 +23,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "perfbench", "perfbench\perfbench.vcxproj", "{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Win32 = Debug|Win32
|
||||
@@ -119,6 +121,14 @@ Global
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.Build.0 = Debug|x64
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.Build.0 = Release|Win32
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.ActiveCfg = Release|x64
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
||||
8
examples/gmres/Makefile
Normal file
8
examples/gmres/Makefile
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
EXAMPLE=gmres
|
||||
CPP_SRC=algorithm.cpp main.cpp matrix.cpp
|
||||
CC_SRC=mmio.c
|
||||
ISPC_SRC=matrix.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
|
||||
include ../common.mk
|
||||
231
examples/gmres/algorithm.cpp
Normal file
231
examples/gmres/algorithm.cpp
Normal file
@@ -0,0 +1,231 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
/*===========================================================================*\
|
||||
|* Includes
|
||||
\*===========================================================================*/
|
||||
#include "algorithm.h"
|
||||
#include "stdio.h"
|
||||
#include "debug.h"
|
||||
|
||||
|
||||
/*===========================================================================*\
|
||||
|* GMRES
|
||||
\*===========================================================================*/
|
||||
/* upper_triangular_right_solve:
|
||||
* ----------------------------
|
||||
* Given upper triangular matrix R and rhs vector b, solve for
|
||||
* x. This "solve" ignores the rows, columns of R that are greater than the
|
||||
* dimensions of x.
|
||||
*/
|
||||
void upper_triangular_right_solve (const DenseMatrix &R, const Vector &b, Vector &x)
|
||||
{
|
||||
// Dimensionality check
|
||||
ASSERT(R.rows() >= b.size());
|
||||
ASSERT(R.cols() >= x.size());
|
||||
ASSERT(b.size() >= x.size());
|
||||
|
||||
int max_row = x.size() - 1;
|
||||
|
||||
// first solve step:
|
||||
x[max_row] = b[max_row] / R(max_row, max_row);
|
||||
|
||||
for (int row = max_row - 1; row >= 0; row--) {
|
||||
double xi = b[row];
|
||||
for (int col = max_row; col > row; col--)
|
||||
xi -= x[col] * R(row, col);
|
||||
x[row] = xi / R(row, row);
|
||||
}
|
||||
}
|
||||
|
||||
/* create_rotation (used in gmres):
|
||||
* -------------------------------
|
||||
* Construct a Givens rotation to zero out the lowest non-zero entry in a partially
|
||||
* factored Hessenburg matrix. Note that the previous Givens rotations should be
|
||||
* applied to this column before creating a new rotation.
|
||||
*/
|
||||
void create_rotation (const DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
|
||||
{
|
||||
double a = H(col, col);
|
||||
double b = H(col + 1, col);
|
||||
double r;
|
||||
|
||||
if (b == 0) {
|
||||
Cn[col] = copysign(1, a);
|
||||
Sn[col] = 0;
|
||||
}
|
||||
else if (a == 0) {
|
||||
Cn[col] = 0;
|
||||
Sn[col] = copysign(1, b);
|
||||
}
|
||||
else {
|
||||
r = sqrt(a*a + b*b);
|
||||
Sn[col] = -b / r;
|
||||
Cn[col] = a / r;
|
||||
}
|
||||
}
|
||||
|
||||
/* Applies the 'col'th Givens rotation stored in vectors Sn and Cn to the 'col'th
|
||||
* column of the DenseMatrix M. (Previous columns don't need the rotation applied b/c
|
||||
* presumeably, the first col-1 columns are already upper triangular, and so their
|
||||
* entries in the col and col+1 rows are 0.)
|
||||
*/
|
||||
void apply_rotation (DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
|
||||
{
|
||||
double c = Cn[col];
|
||||
double s = Sn[col];
|
||||
double tmp = c * H(col, col) - s * H(col+1, col);
|
||||
H(col+1, col) = s * H(col, col) + c * H(col+1, col);
|
||||
H(col, col) = tmp;
|
||||
}
|
||||
|
||||
/* Applies the 'col'th Givens rotation to the vector.
|
||||
*/
|
||||
void apply_rotation (Vector &v, size_t col, Vector &Cn, Vector &Sn)
|
||||
{
|
||||
double a = v[col];
|
||||
double b = v[col + 1];
|
||||
|
||||
double c = Cn[col];
|
||||
double s = Sn[col];
|
||||
|
||||
v[col] = c * a - s * b;
|
||||
v[col + 1] = s * a + c * b;
|
||||
}
|
||||
|
||||
/* Applies the first 'col' Givens rotations to the newly-created column
|
||||
* of H. (Leaves other columns alone.)
|
||||
*/
|
||||
void update_column (DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
|
||||
{
|
||||
for (int i = 0; i < col; i++) {
|
||||
double c = Cn[i];
|
||||
double s = Sn[i];
|
||||
double t = c * H(i,col) - s * H(i+1,col);
|
||||
H(i+1, col) = s * H(i,col) + c * H(i+1,col);
|
||||
H(i, col) = t;
|
||||
}
|
||||
}
|
||||
|
||||
/* After a new column has been added to the hessenburg matrix, factor it back into
|
||||
* an upper-triangular matrix by:
|
||||
* - applying the previous Givens rotations to the new column
|
||||
* - computing the new Givens rotation to make the column upper triangluar
|
||||
* - applying the new Givens rotation to the column, and
|
||||
* - applying the new Givens rotation to the solution vector
|
||||
*/
|
||||
void update_qr_decomp (DenseMatrix &H, Vector &s, size_t col, Vector &Cn, Vector &Sn)
|
||||
{
|
||||
update_column( H, col, Cn, Sn);
|
||||
create_rotation(H, col, Cn, Sn);
|
||||
apply_rotation( H, col, Cn, Sn);
|
||||
apply_rotation( s, col, Cn, Sn);
|
||||
}
|
||||
|
||||
void gmres (const Matrix &A, const Vector &b, Vector &x, int num_iters, double max_err)
|
||||
{
|
||||
DEBUG_PRINT("gmres starting!\n");
|
||||
x.zero();
|
||||
|
||||
ASSERT(A.rows() == A.cols());
|
||||
DenseMatrix Qstar(num_iters + 1, A.rows());
|
||||
DenseMatrix H(num_iters + 1, num_iters);
|
||||
|
||||
// arrays for storing parameters of givens rotations
|
||||
Vector Sn(num_iters);
|
||||
Vector Cn(num_iters);
|
||||
|
||||
// array for storing the rhs projected onto the hessenburg's column space
|
||||
Vector G(num_iters+1);
|
||||
G.zero();
|
||||
|
||||
double beta = b.norm();
|
||||
G[0] = beta;
|
||||
|
||||
// temp vector, stores Aqi
|
||||
Vector w(A.rows());
|
||||
|
||||
w.copy(b);
|
||||
w.normalize();
|
||||
Qstar.set_row(0, w);
|
||||
|
||||
int iter = 0;
|
||||
Vector temp(A.rows(), false);
|
||||
double rel_err;
|
||||
|
||||
while (iter < num_iters)
|
||||
{
|
||||
// w = Aqi
|
||||
Qstar.row(iter, temp);
|
||||
A.multiply(temp, w);
|
||||
|
||||
// construct ith column of H, i+1th row of Qstar:
|
||||
for (int row = 0; row <= iter; row++) {
|
||||
Qstar.row(row, temp);
|
||||
H(row, iter) = temp.dot(w);
|
||||
w.add_ax(-H(row, iter), temp);
|
||||
}
|
||||
|
||||
H(iter+1, iter) = w.norm();
|
||||
w.divide(H(iter+1, iter));
|
||||
Qstar.set_row(iter+1, w);
|
||||
|
||||
update_qr_decomp (H, G, iter, Cn, Sn);
|
||||
|
||||
rel_err = fabs(G[iter+1] / beta);
|
||||
|
||||
if (rel_err < max_err)
|
||||
break;
|
||||
|
||||
if (iter % 100 == 0)
|
||||
DEBUG_PRINT("Iter %d: %f err\n", iter, rel_err);
|
||||
|
||||
iter++;
|
||||
}
|
||||
|
||||
if (iter == num_iters) {
|
||||
fprintf(stderr, "Error: gmres failed to converge in %d iterations (relative err: %f)\n", num_iters, rel_err);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// We've reached an acceptable solution (?):
|
||||
|
||||
DEBUG_PRINT("gmres completed in %d iterations (rel. resid. %f, max %f)\n", num_iters, rel_err, max_err);
|
||||
Vector y(iter+1);
|
||||
upper_triangular_right_solve(H, G, y);
|
||||
for (int i = 0; i < iter + 1; i++) {
|
||||
Qstar.row(i, temp);
|
||||
x.add_ax(y[i], temp);
|
||||
}
|
||||
}
|
||||
50
examples/gmres/algorithm.h
Normal file
50
examples/gmres/algorithm.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __ALGORITHM_H__
|
||||
#define __ALGORITHM_H__
|
||||
|
||||
#include "matrix.h"
|
||||
|
||||
|
||||
/* Generalized Minimal Residual Method:
|
||||
* -----------------------------------
|
||||
* Takes a square matrix and an rhs and uses GMRES to find an estimate for x.
|
||||
* The specified error is relative.
|
||||
*/
|
||||
void gmres (const Matrix &A, const Vector &b, Vector &x, int num_iters, double err);
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
8671
examples/gmres/data/c-18/c-18.mtx
Normal file
8671
examples/gmres/data/c-18/c-18.mtx
Normal file
File diff suppressed because it is too large
Load Diff
2176
examples/gmres/data/c-18/c-18_b.mtx
Normal file
2176
examples/gmres/data/c-18/c-18_b.mtx
Normal file
File diff suppressed because it is too large
Load Diff
17847
examples/gmres/data/c-21/c-21.mtx
Normal file
17847
examples/gmres/data/c-21/c-21.mtx
Normal file
File diff suppressed because it is too large
Load Diff
3516
examples/gmres/data/c-21/c-21_b.mtx
Normal file
3516
examples/gmres/data/c-21/c-21_b.mtx
Normal file
File diff suppressed because it is too large
Load Diff
16346
examples/gmres/data/c-22/c-22.mtx
Normal file
16346
examples/gmres/data/c-22/c-22.mtx
Normal file
File diff suppressed because it is too large
Load Diff
3799
examples/gmres/data/c-22/c-22_b.mtx
Normal file
3799
examples/gmres/data/c-22/c-22_b.mtx
Normal file
File diff suppressed because it is too large
Load Diff
26730
examples/gmres/data/c-25/c-25.mtx
Normal file
26730
examples/gmres/data/c-25/c-25.mtx
Normal file
File diff suppressed because it is too large
Load Diff
3804
examples/gmres/data/c-25/c-25_b.mtx
Normal file
3804
examples/gmres/data/c-25/c-25_b.mtx
Normal file
File diff suppressed because it is too large
Load Diff
55
examples/gmres/debug.h
Normal file
55
examples/gmres/debug.h
Normal file
@@ -0,0 +1,55 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __DEBUG_H__
|
||||
#define __DEBUG_H__
|
||||
|
||||
#include <cassert>
#include <cstdio>
|
||||
|
||||
|
||||
/**************************************************************\
| Macros
\**************************************************************/
// DEBUG is defined unconditionally, so ASSERT and DEBUG_PRINT are always
// active. Remove this definition to compile them out.
#define DEBUG

#ifdef DEBUG
#define ASSERT(expr) assert(expr)
#define DEBUG_PRINT(...) printf(__VA_ARGS__)
#else
// Disabled variants expand to a no-op expression rather than to nothing, so
// "if (c) DEBUG_PRINT(...);" stays a well-formed statement.
#define ASSERT(expr) ((void) 0)
#define DEBUG_PRINT(...) ((void) 0)
#endif
|
||||
|
||||
|
||||
#endif
|
||||
79
examples/gmres/main.cpp
Normal file
79
examples/gmres/main.cpp
Normal file
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#include "matrix.h"
|
||||
#include "algorithm.h"
|
||||
#include "util.h"
|
||||
#include <cmath>
|
||||
#include "../timing.h"
|
||||
|
||||
|
||||
int main (int argc, char **argv)
|
||||
{
|
||||
if (argc < 4) {
|
||||
printf("usage: %s <input-matrix> <input-rhs> <output-file>\n", argv[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
double gmres_cycles;
|
||||
|
||||
DEBUG_PRINT("Loading A...\n");
|
||||
Matrix *A = CRSMatrix::matrix_from_mtf(argv[1]);
|
||||
if (A == NULL)
|
||||
return -1;
|
||||
DEBUG_PRINT("... size: %lu\n", A->cols());
|
||||
|
||||
DEBUG_PRINT("Loading b...\n");
|
||||
Vector *b = Vector::vector_from_mtf(argv[2]);
|
||||
if (b == NULL)
|
||||
return -1;
|
||||
|
||||
Vector x(A->cols());
|
||||
DEBUG_PRINT("Beginning gmres...\n");
|
||||
gmres(*A, *b, x, A->cols() / 2, .01);
|
||||
|
||||
// Write result out to file
|
||||
x.to_mtf(argv[argc-1]);
|
||||
|
||||
// Compute residual (double-check)
|
||||
#ifdef DEBUG
|
||||
Vector bprime(b->size());
|
||||
A->multiply(x, bprime);
|
||||
Vector resid(bprime.size(), &(bprime[0]));
|
||||
resid.subtract(*b);
|
||||
DEBUG_PRINT("residual error check: %lg\n", resid.norm() / b->norm());
|
||||
#endif
|
||||
// Print profiling results
|
||||
DEBUG_PRINT("-- Total mcycles to solve : %.03f --\n", gmres_cycles);
|
||||
}
|
||||
246
examples/gmres/matrix.cpp
Normal file
246
examples/gmres/matrix.cpp
Normal file
@@ -0,0 +1,246 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
/**************************************************************\
|
||||
| Includes
|
||||
\**************************************************************/
|
||||
#include "matrix.h"
|
||||
#include "matrix_ispc.h"
|
||||
|
||||
extern "C" {
|
||||
#include "mmio.h"
|
||||
}
|
||||
|
||||
/**************************************************************\
|
||||
| DenseMatrix methods
|
||||
\**************************************************************/
|
||||
void DenseMatrix::multiply (const Vector &v, Vector &r) const
|
||||
{
|
||||
// Dimensionality check
|
||||
ASSERT(v.size() == cols());
|
||||
ASSERT(r.size() == rows());
|
||||
|
||||
for (int i = 0; i < rows(); i++)
|
||||
r[i] = v.dot(entries + i * num_cols);
|
||||
}
|
||||
|
||||
/* Returns a newly allocated Vector of length num_cols that aliases row 'row'
 * of this matrix (constructed with share_ptr = true, so the data is not
 * copied). The returned object was created with 'new'.
 */
const Vector *DenseMatrix::row (size_t row) const {
    double *row_start = entries + row * num_cols;
    return new Vector(num_cols, row_start, true);
}
|
||||
|
||||
void DenseMatrix::row (size_t row, Vector &r) {
|
||||
r.entries = entries + row * cols();
|
||||
r._size = cols();
|
||||
}
|
||||
|
||||
void DenseMatrix::set_row(size_t row, const Vector &v)
|
||||
{
|
||||
ASSERT(v.size() == num_cols);
|
||||
memcpy(entries + row * num_cols, v.entries, num_cols * sizeof(double));
|
||||
}
|
||||
|
||||
|
||||
/**************************************************************\
|
||||
| CRSMatrix Methods
|
||||
\**************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
|
||||
/* One (row, column, value) triple of a sparse matrix. */
struct entry {
    int row;
    int col;
    double val;
};

/* Strict-weak ordering for sorting entries: by row first, then by column.
 * Returns true when i must come before j.
 */
bool compare_entries(struct entry i, struct entry j) {
    if (i.row != j.row)
        return i.row < j.row;

    return i.col < j.col;
}
|
||||
|
||||
#define ERR_OUT(...) { fprintf(stderr, __VA_ARGS__); return NULL; }
|
||||
|
||||
/* Loads a real, sparse, square matrix from the Matrix Market coordinate file
 * at 'path' into a newly allocated CRSMatrix.
 * On any error a message is printed to stderr and NULL is returned.
 * NOTE(review): every stored entry is used as-is; if the file uses symmetric
 * storage the mirrored entries are not generated here -- confirm this is intended.
 */
CRSMatrix *CRSMatrix::matrix_from_mtf (char *path) {
    // Closes the stream on every exit path, including the ERR_OUT returns.
    struct ScopedFile {
        FILE *fp;
        explicit ScopedFile(FILE *p) : fp(p) {}
        ~ScopedFile() { if (fp != NULL) fclose(fp); }
    };

    MM_typecode matcode;
    int m, n, nz;

    ScopedFile file(fopen(path, "r"));
    FILE *f = file.fp;

    if (f == NULL)
        ERR_OUT("Error: %s does not name a valid/readable file.\n", path);

    if (mm_read_banner(f, &matcode) != 0)
        ERR_OUT("Error: Could not process Matrix Market banner.\n");

    if (mm_is_complex(matcode))
        ERR_OUT("Error: Application does not support complex numbers.\n");

    if (mm_is_dense(matcode))
        ERR_OUT("Error: supplied matrix is dense (should be sparse.)\n");

    if (!mm_is_matrix(matcode))
        ERR_OUT("Error: %s does not encode a matrix.\n", path);

    if (mm_read_mtx_crd_size(f, &m, &n, &nz) != 0)
        ERR_OUT("Error: could not read matrix size from file.\n");

    if (m != n)
        ERR_OUT("Error: Application does not support non-square matrices.\n");

    if (m < 0 || nz < 0)
        ERR_OUT("Error: could not read matrix size from file.\n");

    // Read and validate all triples before allocating the matrix, so the
    // error paths have nothing to free.
    std::vector<struct entry> triples(nz);

    for (int i = 0; i < nz; i++) {
        struct entry &e = triples[i];
        if (fscanf(f, "%d %d %lg\n", &e.row, &e.col, &e.val) != 3)
            ERR_OUT("Error: could not read entry %d of %s.\n", i + 1, path);
        if (e.row < 1 || e.row > m || e.col < 1 || e.col > n)
            ERR_OUT("Error: entry %d of %s lies outside the matrix.\n", i + 1, path);
        // Adjust from 1-based to 0-based
        e.row--;
        e.col--;
    }

    std::sort(triples.begin(), triples.end(), compare_entries);

    CRSMatrix *M = new CRSMatrix(m, n, nz);
    int cur_row = -1;
    for (int i = 0; i < nz; i++) {
        // Every row up to and including this entry's row starts at index i.
        while (triples[i].row > cur_row)
            M->row_offsets[++cur_row] = i;
        M->entries[i] = triples[i].val;
        M->columns[i] = triples[i].col;
    }

    // Rows after the last stored entry are empty and must start at nz.
    // Left at 0 they make multiply() compute a wrong range for the last
    // non-empty row and for the final row.
    while (cur_row < m - 1)
        M->row_offsets[++cur_row] = nz;

    return M;
}
|
||||
|
||||
/* Loads a real vector (an m x 1 matrix, in dense array or sparse coordinate
 * form) from the Matrix Market file at 'path' into a newly allocated Vector.
 * On any error a message is printed to stderr and NULL is returned.
 */
Vector *Vector::vector_from_mtf (char *path) {
    // Closes the stream on every exit path, including the ERR_OUT returns.
    struct ScopedFile {
        FILE *fp;
        explicit ScopedFile(FILE *p) : fp(p) {}
        ~ScopedFile() { if (fp != NULL) fclose(fp); }
    };

    MM_typecode matcode;
    int m, n;
    int nz = 0;     // only read from the file for the sparse format

    ScopedFile file(fopen(path, "r"));
    FILE *f = file.fp;

    if (f == NULL)
        ERR_OUT("Error: %s does not name a valid/readable file.\n", path);

    if (mm_read_banner(f, &matcode) != 0)
        ERR_OUT("Error: Could not process Matrix Market banner.\n");

    if (mm_is_complex(matcode))
        ERR_OUT("Error: Application does not support complex numbers.\n");

    if (mm_is_dense(matcode)) {
        if (mm_read_mtx_array_size(f, &m, &n) != 0)
            ERR_OUT("Error: could not read matrix size from file.\n");
    } else {
        if (mm_read_mtx_crd_size(f, &m, &n, &nz) != 0)
            ERR_OUT("Error: could not read matrix size from file.\n");
    }
    if (n != 1)
        ERR_OUT("Error: %s does not describe a vector.\n", path);

    if (m < 0 || nz < 0)
        ERR_OUT("Error: could not read matrix size from file.\n");

    Vector *x = new Vector(m);

    if (mm_is_dense(matcode)) {
        double val;
        for (int i = 0; i < m; i++) {
            if (fscanf(f, "%lg\n", &val) != 1) {
                delete x;
                ERR_OUT("Error: could not read entry %d of %s.\n", i + 1, path);
            }
            (*x)[i] = val;
        }
    }
    else {
        // Entries not listed in the file stay zero.
        x->zero();
        double val;
        int row;
        int col;
        for (int i = 0; i < nz; i++) {
            if (fscanf(f, "%d %d %lg\n", &row, &col, &val) != 3) {
                delete x;
                ERR_OUT("Error: could not read entry %d of %s.\n", i + 1, path);
            }
            // File indices are 1-based; reject anything outside 1..m.
            if (row < 1 || row > m) {
                delete x;
                ERR_OUT("Error: entry %d of %s lies outside the vector.\n", i + 1, path);
            }
            (*x)[row-1] = val;
        }
    }
    return x;
}
|
||||
|
||||
#define ERR(...) { fprintf(stderr, __VA_ARGS__); exit(-1); }
|
||||
|
||||
void Vector::to_mtf (char *path) {
|
||||
FILE *f;
|
||||
MM_typecode matcode;
|
||||
|
||||
mm_initialize_typecode(&matcode);
|
||||
mm_set_matrix(&matcode);
|
||||
mm_set_real(&matcode);
|
||||
mm_set_dense(&matcode);
|
||||
mm_set_general(&matcode);
|
||||
|
||||
if ((f = fopen(path, "w")) == NULL)
|
||||
ERR("Error: cannot open/write to %s\n", path);
|
||||
|
||||
mm_write_banner(f, matcode);
|
||||
mm_write_mtx_array_size(f, size(), 1);
|
||||
for (int i = 0; i < size(); i++)
|
||||
fprintf(f, "%lg\n", entries[i]);
|
||||
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
void CRSMatrix::multiply (const Vector &v, Vector &r) const
|
||||
{
|
||||
ASSERT(v.size() == cols());
|
||||
ASSERT(r.size() == rows());
|
||||
|
||||
for (int row = 0; row < rows(); row++)
|
||||
{
|
||||
int row_offset = row_offsets[row];
|
||||
int next_offset = ((row + 1 == rows()) ? _nonzeroes : row_offsets[row + 1]);
|
||||
|
||||
double sum = 0;
|
||||
for (int i = row_offset; i < next_offset; i++)
|
||||
{
|
||||
sum += v[columns[i]] * entries[i];
|
||||
}
|
||||
r[row] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
void CRSMatrix::zero ( )
|
||||
{
|
||||
entries.clear();
|
||||
row_offsets.clear();
|
||||
columns.clear();
|
||||
_nonzeroes = 0;
|
||||
}
|
||||
279
examples/gmres/matrix.h
Normal file
279
examples/gmres/matrix.h
Normal file
@@ -0,0 +1,279 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __MATRIX_H__
|
||||
#define __MATRIX_H__
|
||||
|
||||
/**************************************************************\
|
||||
| Includes
|
||||
\**************************************************************/
|
||||
#include <cstring> // size_t
|
||||
#include <cstdlib> // malloc, memcpy, etc.
|
||||
#include <cmath> // sqrt
|
||||
#include <vector>
|
||||
|
||||
#include "debug.h"
|
||||
#include "matrix_ispc.h"
|
||||
|
||||
|
||||
class DenseMatrix;
|
||||
/**************************************************************\
|
||||
| Vector class
|
||||
\**************************************************************/
|
||||
class Vector {
|
||||
public:
|
||||
static Vector *vector_from_mtf(char *path);
|
||||
void to_mtf (char *path);
|
||||
|
||||
Vector(size_t size, bool alloc_mem=true)
|
||||
{
|
||||
shared_ptr = false;
|
||||
_size = size;
|
||||
|
||||
if (alloc_mem)
|
||||
entries = (double *) malloc(sizeof(double) * _size);
|
||||
else {
|
||||
shared_ptr = true;
|
||||
entries = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
Vector(size_t size, double *content, bool share_ptr=false)
|
||||
{
|
||||
_size = size;
|
||||
if (share_ptr) {
|
||||
entries = content;
|
||||
shared_ptr = true;
|
||||
}
|
||||
else {
|
||||
shared_ptr = false;
|
||||
entries = (double *) malloc(sizeof(double) * _size);
|
||||
memcpy(entries, content, sizeof(double) * _size);
|
||||
}
|
||||
}
|
||||
|
||||
~Vector() { if (!shared_ptr) free(entries); }
|
||||
|
||||
const double & operator [] (size_t index) const
|
||||
{
|
||||
ASSERT(index < _size);
|
||||
return *(entries + index);
|
||||
}
|
||||
|
||||
double &operator [] (size_t index)
|
||||
{
|
||||
ASSERT(index < _size);
|
||||
return *(entries + index);
|
||||
}
|
||||
|
||||
bool operator == (const Vector &v) const
|
||||
{
|
||||
if (v.size() != _size)
|
||||
return false;
|
||||
|
||||
for (int i = 0; i < _size; i++)
|
||||
if (entries[i] != v[i])
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t size() const {return _size; }
|
||||
|
||||
double dot (const Vector &b) const
|
||||
{
|
||||
ASSERT(b.size() == this->size());
|
||||
return ispc::vector_dot(entries, b.entries, size());
|
||||
}
|
||||
|
||||
double dot (const double * const b) const
|
||||
{
|
||||
return ispc::vector_dot(entries, b, size());
|
||||
}
|
||||
|
||||
void zero ()
|
||||
{
|
||||
ispc::zero(entries, size());
|
||||
}
|
||||
|
||||
double norm () const { return sqrtf(dot(entries)); }
|
||||
|
||||
void normalize () { this->divide(this->norm()); }
|
||||
|
||||
void add (const Vector &a)
|
||||
{
|
||||
ASSERT(size() == a.size());
|
||||
ispc::vector_add(entries, a.entries, size());
|
||||
}
|
||||
|
||||
void subtract (const Vector &s)
|
||||
{
|
||||
ASSERT(size() == s.size());
|
||||
ispc::vector_sub(entries, s.entries, size());
|
||||
}
|
||||
|
||||
void multiply (double scalar)
|
||||
{
|
||||
ispc::vector_mult(entries, scalar, size());
|
||||
}
|
||||
|
||||
void divide (double scalar)
|
||||
{
|
||||
ispc::vector_div(entries, scalar, size());
|
||||
}
|
||||
|
||||
// Note: x may be longer than *(this)
|
||||
void add_ax (double a, const Vector &x) {
|
||||
ASSERT(x.size() >= size());
|
||||
ispc::vector_add_ax(entries, a, x.entries, size());
|
||||
}
|
||||
|
||||
// Note that copy only copies the first size() elements of the
|
||||
// supplied vector, i.e. the supplied vector can be longer than
|
||||
// this one. This is useful in least squares calculations.
|
||||
void copy (const Vector &other) {
|
||||
ASSERT(other.size() >= size());
|
||||
memcpy(entries, other.entries, size() * sizeof(double));
|
||||
}
|
||||
|
||||
friend class DenseMatrix;
|
||||
|
||||
private:
|
||||
size_t _size;
|
||||
bool shared_ptr;
|
||||
double *entries;
|
||||
};
|
||||
|
||||
|
||||
/**************************************************************\
|
||||
| Matrix base class
|
||||
\**************************************************************/
|
||||
class Matrix {
|
||||
friend class Vector;
|
||||
|
||||
public:
|
||||
Matrix(size_t size_r, size_t size_c)
|
||||
{
|
||||
num_rows = size_r;
|
||||
num_cols = size_c;
|
||||
}
|
||||
~Matrix(){}
|
||||
|
||||
size_t rows() const { return num_rows; }
|
||||
size_t cols() const { return num_cols; }
|
||||
|
||||
virtual void multiply (const Vector &v, Vector &r) const = 0;
|
||||
virtual void zero () = 0;
|
||||
|
||||
protected:
|
||||
size_t num_rows;
|
||||
size_t num_cols;
|
||||
};
|
||||
|
||||
/**************************************************************\
|
||||
| DenseMatrix class
|
||||
\**************************************************************/
|
||||
class DenseMatrix : public Matrix {
|
||||
friend class Vector;
|
||||
|
||||
public:
|
||||
DenseMatrix(size_t size_r, size_t size_c) : Matrix(size_r, size_c)
|
||||
{
|
||||
entries = (double *) malloc(size_r * size_c * sizeof(double));
|
||||
}
|
||||
|
||||
DenseMatrix(size_t size_r, size_t size_c, const double *content) : Matrix (size_r, size_c)
|
||||
{
|
||||
entries = (double *) malloc(size_r * size_c * sizeof(double));
|
||||
memcpy(entries, content, size_r * size_c * sizeof(double));
|
||||
}
|
||||
|
||||
virtual void multiply (const Vector &v, Vector &r) const;
|
||||
|
||||
double &operator () (unsigned int r, unsigned int c)
|
||||
{
|
||||
return *(entries + r * num_cols + c);
|
||||
}
|
||||
|
||||
const double &operator () (unsigned int r, unsigned int c) const
|
||||
{
|
||||
return *(entries + r * num_cols + c);
|
||||
}
|
||||
|
||||
const Vector *row(size_t row) const;
|
||||
void row(size_t row, Vector &r);
|
||||
void set_row(size_t row, const Vector &v);
|
||||
|
||||
virtual void zero() { ispc::zero(entries, rows() * cols()); }
|
||||
|
||||
void copy (const DenseMatrix &other)
|
||||
{
|
||||
ASSERT(rows() == other.rows());
|
||||
ASSERT(cols() == other.cols());
|
||||
memcpy(entries, other.entries, rows() * cols() * sizeof(double));
|
||||
}
|
||||
|
||||
private:
|
||||
double *entries;
|
||||
bool shared_ptr;
|
||||
};
|
||||
|
||||
/**************************************************************\
|
||||
| CRSMatrix (compressed row storage, a sparse matrix format)
|
||||
\**************************************************************/
|
||||
class CRSMatrix : public Matrix {
|
||||
public:
|
||||
CRSMatrix (size_t size_r, size_t size_c, size_t nonzeroes) :
|
||||
Matrix(size_r, size_c)
|
||||
{
|
||||
_nonzeroes = nonzeroes;
|
||||
entries.resize(nonzeroes);
|
||||
columns.resize(nonzeroes);
|
||||
row_offsets.resize(size_r);
|
||||
}
|
||||
|
||||
virtual void multiply(const Vector &v, Vector &r) const;
|
||||
|
||||
virtual void zero();
|
||||
|
||||
static CRSMatrix *matrix_from_mtf (char *path);
|
||||
|
||||
private:
|
||||
unsigned int _nonzeroes;
|
||||
std::vector<double> entries;
|
||||
std::vector<int> row_offsets;
|
||||
std::vector<int> columns;
|
||||
};
|
||||
|
||||
#endif
|
||||
122
examples/gmres/matrix.ispc
Normal file
122
examples/gmres/matrix.ispc
Normal file
@@ -0,0 +1,122 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
/**************************************************************\
|
||||
| General
|
||||
\**************************************************************/
|
||||
/* Sets data[0 .. size-1] to 0.0. */
export void zero (uniform double data[],
                  uniform int size)
{
    foreach (idx = 0 ... size) {
        data[idx] = 0.0;
    }
}
|
||||
|
||||
|
||||
/**************************************************************\
|
||||
| Vector helpers
|
||||
\**************************************************************/
|
||||
/* In-place element-wise sum: a[k] += b[k] for k in [0, size). */
export void vector_add (uniform double a[],
                        const uniform double b[],
                        const uniform int size)
{
    foreach (k = 0 ... size) {
        a[k] = a[k] + b[k];
    }
}
|
||||
|
||||
/* In-place element-wise difference: a[k] -= b[k] for k in [0, size). */
export void vector_sub (uniform double a[],
                        const uniform double b[],
                        const uniform int size)
{
    foreach (k = 0 ... size) {
        a[k] = a[k] - b[k];
    }
}
|
||||
|
||||
/* In-place scaling: a[k] *= b for k in [0, size). */
export void vector_mult (uniform double a[],
                         const uniform double b,
                         const uniform int size)
{
    foreach (k = 0 ... size) {
        a[k] = a[k] * b;
    }
}
|
||||
|
||||
/* In-place division by a scalar: a[k] /= b for k in [0, size). */
export void vector_div (uniform double a[],
                        const uniform double b,
                        const uniform int size)
{
    foreach (k = 0 ... size) {
        a[k] = a[k] / b;
    }
}
|
||||
|
||||
/* In-place scaled accumulation: r[k] = r[k] + a * x[k] for k in [0, size). */
export void vector_add_ax (uniform double r[],
                           const uniform double a,
                           const uniform double x[],
                           const uniform int size)
{
    foreach (k = 0 ... size) {
        r[k] = r[k] + a * x[k];
    }
}
|
||||
|
||||
/* Dot product of the first `size` elements of `a` and `b`.
   Each program instance accumulates its own partial sum; the partial sums
   are combined with reduce_add() to produce the uniform result. */
export uniform double vector_dot (const uniform double a[],
                                  const uniform double b[],
                                  const uniform int size)
{
    varying double partial = 0.0;

    foreach (k = 0 ... size) {
        partial += a[k] * b[k];
    }

    return reduce_add(partial);
}
|
||||
|
||||
/**************************************************************\
|
||||
| Matrix helpers
|
||||
\**************************************************************/
|
||||
/* Sparse matrix-vector product r = A * v.
   For each row, the non-zero values are entries[j] and their column indices
   are columns[j], for j from row_offsets[row] up to (but excluding) the next
   row's offset; the last row ends at `nonzeroes`.
   `cols` is accepted but not used by this routine.
   NOTE(review): `columns` and `row_offsets` are declared as double arrays
   although they are used as integer indices/offsets -- confirm this matches
   how the caller stores them before changing the signature. */
export void sparse_multiply (const uniform double entries[],
                             const uniform double columns[],
                             const uniform double row_offsets[],
                             const uniform int rows,
                             const uniform int cols,
                             const uniform int nonzeroes,
                             const uniform double v[],
                             uniform double r[])
{
    foreach (row = 0 ... rows) {
        /* Half-open range [first, past_last) of this row's non-zeroes. */
        int first = row_offsets[row];
        int past_last = ((row + 1 == rows) ? nonzeroes : row_offsets[row+1]);

        double acc = 0;
        for (int nz_idx = first; nz_idx < past_last; nz_idx++) {
            acc += v[columns[nz_idx]] * entries[nz_idx];
        }

        r[row] = acc;
    }
}
|
||||
|
||||
511
examples/gmres/mmio.c
Normal file
511
examples/gmres/mmio.c
Normal file
@@ -0,0 +1,511 @@
|
||||
/*
|
||||
* Matrix Market I/O library for ANSI C
|
||||
*
|
||||
* See http://math.nist.gov/MatrixMarket for details.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "mmio.h"
|
||||
|
||||
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
|
||||
double **val_, int **I_, int **J_)
|
||||
{
|
||||
FILE *f;
|
||||
MM_typecode matcode;
|
||||
int M, N, nz;
|
||||
int i;
|
||||
double *val;
|
||||
int *I, *J;
|
||||
|
||||
if ((f = fopen(fname, "r")) == NULL)
|
||||
return -1;
|
||||
|
||||
|
||||
if (mm_read_banner(f, &matcode) != 0)
|
||||
{
|
||||
printf("mm_read_unsymetric: Could not process Matrix Market banner ");
|
||||
printf(" in file [%s]\n", fname);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
|
||||
mm_is_sparse(matcode)))
|
||||
{
|
||||
fprintf(stderr, "Sorry, this application does not support ");
|
||||
fprintf(stderr, "Market Market type: [%s]\n",
|
||||
mm_typecode_to_str(matcode));
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* find out size of sparse matrix: M, N, nz .... */
|
||||
|
||||
if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
|
||||
{
|
||||
fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
*M_ = M;
|
||||
*N_ = N;
|
||||
*nz_ = nz;
|
||||
|
||||
/* reseve memory for matrices */
|
||||
|
||||
I = (int *) malloc(nz * sizeof(int));
|
||||
J = (int *) malloc(nz * sizeof(int));
|
||||
val = (double *) malloc(nz * sizeof(double));
|
||||
|
||||
*val_ = val;
|
||||
*I_ = I;
|
||||
*J_ = J;
|
||||
|
||||
/* NOTE: when reading in doubles, ANSI C requires the use of the "l" */
|
||||
/* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
|
||||
/* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */
|
||||
|
||||
for (i=0; i<nz; i++)
|
||||
{
|
||||
fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
|
||||
I[i]--; /* adjust from 1-based to 0-based */
|
||||
J[i]--;
|
||||
}
|
||||
fclose(f);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Check a typecode for combinations that are rejected as invalid.
 * Returns 0 when the typecode is not a matrix, or is dense+pattern,
 * real+hermitian, or pattern combined with hermitian or skew;
 * returns 1 otherwise.
 */
int mm_is_valid(MM_typecode matcode)
{
    const int is_pattern = mm_is_pattern(matcode);
    const int is_hermitian = mm_is_hermitian(matcode);

    if (!mm_is_matrix(matcode))
        return 0;

    if (is_pattern && mm_is_dense(matcode))
        return 0;

    if (is_hermitian && mm_is_real(matcode))
        return 0;

    if (is_pattern && (is_hermitian || mm_is_skew(matcode)))
        return 0;

    return 1;
}
|
||||
|
||||
/*
 * Read and parse the banner (first) line of a Matrix Market stream.
 *
 * f        stream positioned at the start of the file
 * matcode  receives the decoded typecode; it is reset with
 *          mm_clear_typecode() before parsing
 *
 * Returns 0 on success, MM_PREMATURE_EOF if the line cannot be read or does
 * not contain five tokens, MM_NO_HEADER if the first token is not the
 * Matrix Market banner, and MM_UNSUPPORTED_TYPE if any of the four
 * descriptor fields is not recognised.
 */
int mm_read_banner(FILE *f, MM_typecode *matcode)
{
    char line[MM_MAX_LINE_LENGTH];
    /* Each token buffer is as large as the line buffer: a single "%s" token
       can then never be longer than its destination, whatever the input. */
    char banner[MM_MAX_LINE_LENGTH];
    char mtx[MM_MAX_LINE_LENGTH];
    char crd[MM_MAX_LINE_LENGTH];
    char data_type[MM_MAX_LINE_LENGTH];
    char storage_scheme[MM_MAX_LINE_LENGTH];
    char *p;


    mm_clear_typecode(matcode);

    if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
        return MM_PREMATURE_EOF;

    if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type,
        storage_scheme) != 5)
        return MM_PREMATURE_EOF;

    /* convert the descriptor fields to lower case; tolower() requires a
       value representable as unsigned char, hence the casts */
    for (p=mtx; *p!='\0'; p++)
        *p = (char) tolower((unsigned char) *p);
    for (p=crd; *p!='\0'; p++)
        *p = (char) tolower((unsigned char) *p);
    for (p=data_type; *p!='\0'; p++)
        *p = (char) tolower((unsigned char) *p);
    for (p=storage_scheme; *p!='\0'; p++)
        *p = (char) tolower((unsigned char) *p);

    /* check for banner */
    if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
        return MM_NO_HEADER;

    /* first field should be "mtx" */
    if (strcmp(mtx, MM_MTX_STR) != 0)
        return  MM_UNSUPPORTED_TYPE;
    mm_set_matrix(matcode);


    /* second field describes whether this is a sparse matrix (in coordinate
            storage) or a dense array */

    if (strcmp(crd, MM_SPARSE_STR) == 0)
        mm_set_sparse(matcode);
    else if (strcmp(crd, MM_DENSE_STR) == 0)
        mm_set_dense(matcode);
    else
        return MM_UNSUPPORTED_TYPE;


    /* third field: element data type */

    if (strcmp(data_type, MM_REAL_STR) == 0)
        mm_set_real(matcode);
    else if (strcmp(data_type, MM_COMPLEX_STR) == 0)
        mm_set_complex(matcode);
    else if (strcmp(data_type, MM_PATTERN_STR) == 0)
        mm_set_pattern(matcode);
    else if (strcmp(data_type, MM_INT_STR) == 0)
        mm_set_integer(matcode);
    else
        return MM_UNSUPPORTED_TYPE;


    /* fourth field: storage scheme (symmetry) */

    if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
        mm_set_general(matcode);
    else if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
        mm_set_symmetric(matcode);
    else if (strcmp(storage_scheme, MM_HERM_STR) == 0)
        mm_set_hermitian(matcode);
    else if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
        mm_set_skew(matcode);
    else
        return MM_UNSUPPORTED_TYPE;


    return 0;
}
|
||||
|
||||
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
|
||||
{
|
||||
if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
|
||||
return MM_COULD_NOT_WRITE_FILE;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
|
||||
{
|
||||
char line[MM_MAX_LINE_LENGTH];
|
||||
int num_items_read;
|
||||
|
||||
/* set return null parameter values, in case we exit with errors */
|
||||
*M = *N = *nz = 0;
|
||||
|
||||
/* now continue scanning until you reach the end-of-comments */
|
||||
do
|
||||
{
|
||||
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
|
||||
return MM_PREMATURE_EOF;
|
||||
}while (line[0] == '%');
|
||||
|
||||
/* line[] is either blank or has M,N, nz */
|
||||
if (sscanf(line, "%d %d %d", M, N, nz) == 3)
|
||||
return 0;
|
||||
|
||||
else
|
||||
do
|
||||
{
|
||||
num_items_read = fscanf(f, "%d %d %d", M, N, nz);
|
||||
if (num_items_read == EOF) return MM_PREMATURE_EOF;
|
||||
}
|
||||
while (num_items_read != 3);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int mm_read_mtx_array_size(FILE *f, int *M, int *N)
|
||||
{
|
||||
char line[MM_MAX_LINE_LENGTH];
|
||||
int num_items_read;
|
||||
/* set return null parameter values, in case we exit with errors */
|
||||
*M = *N = 0;
|
||||
|
||||
/* now continue scanning until you reach the end-of-comments */
|
||||
do
|
||||
{
|
||||
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
|
||||
return MM_PREMATURE_EOF;
|
||||
}while (line[0] == '%');
|
||||
|
||||
/* line[] is either blank or has M,N, nz */
|
||||
if (sscanf(line, "%d %d", M, N) == 2)
|
||||
return 0;
|
||||
|
||||
else /* we have a blank line */
|
||||
do
|
||||
{
|
||||
num_items_read = fscanf(f, "%d %d", M, N);
|
||||
if (num_items_read == EOF) return MM_PREMATURE_EOF;
|
||||
}
|
||||
while (num_items_read != 2);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mm_write_mtx_array_size(FILE *f, int M, int N)
|
||||
{
|
||||
if (fprintf(f, "%d %d\n", M, N) != 2)
|
||||
return MM_COULD_NOT_WRITE_FILE;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*-------------------------------------------------------------------------*/
|
||||
|
||||
/******************************************************************/
|
||||
/* use when I[], J[], and val[] are already allocated             */
|
||||
/******************************************************************/
|
||||
|
||||
/*
 * Read nz coordinate entries from f into caller-allocated arrays.
 *
 * complex: each entry is "i j re im"; re/im go to val[2*k], val[2*k+1]
 * real:    each entry is "i j v";     v goes to val[k]
 * pattern: each entry is "i j";       val is not touched
 *
 * Indices are stored exactly as read (no base adjustment).  M and N are
 * accepted but not used here.  Returns 0 on success, MM_PREMATURE_EOF as
 * soon as an entry cannot be fully parsed, and MM_UNSUPPORTED_TYPE for any
 * other data type.
 */
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
        double val[], MM_typecode matcode)
{
    int k = 0;

    if (mm_is_complex(matcode)) {
        while (k < nz) {
            if (fscanf(f, "%d %d %lg %lg", &I[k], &J[k], &val[2*k], &val[2*k+1]) != 4)
                return MM_PREMATURE_EOF;
            k++;
        }
        return 0;
    }

    if (mm_is_real(matcode)) {
        while (k < nz) {
            if (fscanf(f, "%d %d %lg\n", &I[k], &J[k], &val[k]) != 3)
                return MM_PREMATURE_EOF;
            k++;
        }
        return 0;
    }

    if (mm_is_pattern(matcode)) {
        while (k < nz) {
            if (fscanf(f, "%d %d", &I[k], &J[k]) != 2)
                return MM_PREMATURE_EOF;
            k++;
        }
        return 0;
    }

    return MM_UNSUPPORTED_TYPE;
}
|
||||
|
||||
/*
 * Read a single coordinate entry from f.
 *
 * complex: reads "i j re im" into *I, *J, *real, *imag
 * real:    reads "i j v" into *I, *J, *real (*imag untouched)
 * pattern: reads "i j" into *I, *J (*real and *imag untouched)
 *
 * Returns 0 on success, MM_PREMATURE_EOF if the entry cannot be fully
 * parsed, and MM_UNSUPPORTED_TYPE for any other data type.
 */
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
        double *real, double *imag, MM_typecode matcode)
{
    int expected;
    int parsed;

    if (mm_is_complex(matcode)) {
        expected = 4;
        parsed = fscanf(f, "%d %d %lg %lg", I, J, real, imag);
    } else if (mm_is_real(matcode)) {
        expected = 3;
        parsed = fscanf(f, "%d %d %lg\n", I, J, real);
    } else if (mm_is_pattern(matcode)) {
        expected = 2;
        parsed = fscanf(f, "%d %d", I, J);
    } else {
        return MM_UNSUPPORTED_TYPE;
    }

    return (parsed == expected) ? 0 : MM_PREMATURE_EOF;
}
|
||||
|
||||
|
||||
/************************************************************************
|
||||
mm_read_mtx_crd() fills M, N, nz, array of values, and returns
|
||||
type code, e.g. 'MCRS'
|
||||
|
||||
if matrix is complex, values[] is of size 2*nz,
|
||||
(nz pairs of real/imaginary values)
|
||||
************************************************************************/
|
||||
|
||||
/*
 * Read a whole sparse (coordinate) Matrix Market file.
 *
 * fname    file to read, or the literal string "stdin" to read from stdin
 * M, N, nz receive the matrix dimensions and the number of entries
 * I, J     receive malloc()ed arrays of nz row / column indices, stored
 *          exactly as read (no base adjustment)
 * val      receives a malloc()ed array of nz doubles (real) or 2*nz
 *          doubles (complex, re/im pairs); NULL for other data types
 * matcode  receives the typecode parsed from the banner
 *
 * Returns 0 on success, otherwise one of the MM_* error codes.  On every
 * error path the file is closed (unless it is stdin); if the arrays had
 * already been allocated they are freed and *I, *J, *val are set to NULL.
 * Allocation failure and a negative entry count are both reported as
 * MM_COULD_NOT_READ_FILE.
 */
int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J,
        double **val, MM_typecode *matcode)
{
    int ret_code;
    FILE *f;
    size_t count;
    size_t doubles_per_entry;

    if (strcmp(fname, "stdin") == 0) f=stdin;
    else
    if ((f = fopen(fname, "r")) == NULL)
        return MM_COULD_NOT_READ_FILE;


    if ((ret_code = mm_read_banner(f, matcode)) != 0)
        goto close_file;

    if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) &&
            mm_is_matrix(*matcode)))
    {
        ret_code = MM_UNSUPPORTED_TYPE;
        goto close_file;
    }

    if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
        goto close_file;

    *I = NULL;
    *J = NULL;
    *val = NULL;

    if (*nz < 0)
    {
        ret_code = MM_COULD_NOT_READ_FILE;
        goto close_file;
    }

    /* request at least one element so that a NULL result always means
       "out of memory", even when nz == 0 */
    count = (*nz > 0) ? (size_t) *nz : 1;

    *I = (int *)  malloc(count * sizeof(int));
    *J = (int *)  malloc(count * sizeof(int));
    if (*I == NULL || *J == NULL)
    {
        ret_code = MM_COULD_NOT_READ_FILE;
        goto release_arrays;
    }

    if (mm_is_complex(*matcode) || mm_is_real(*matcode))
    {
        /* complex values are stored as (real, imaginary) pairs */
        doubles_per_entry = mm_is_complex(*matcode) ? 2 : 1;
        *val = (double *) malloc(count * doubles_per_entry * sizeof(double));
        if (*val == NULL)
        {
            ret_code = MM_COULD_NOT_READ_FILE;
            goto release_arrays;
        }
    }

    /* NOTE(review): for any other data type (e.g. integer) no entries are
       read and 0 is returned with I/J unfilled; this keeps the behaviour
       callers currently see -- confirm whether it should be an error. */
    if (mm_is_complex(*matcode) || mm_is_real(*matcode) ||
            mm_is_pattern(*matcode))
    {
        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
                *matcode);
    }

release_arrays:
    if (ret_code != 0)
    {
        free(*I);
        free(*J);
        free(*val);
        *I = NULL;
        *J = NULL;
        *val = NULL;
    }

close_file:
    if (f != stdin) fclose(f);
    return ret_code;
}
|
||||
|
||||
/*
 * Write the Matrix Market banner line for matcode to f.
 * Returns 0 on success, MM_UNSUPPORTED_TYPE if the typecode cannot be
 * converted to text, and MM_COULD_NOT_WRITE_FILE if the write fails.
 */
int mm_write_banner(FILE *f, MM_typecode matcode)
{
    char *str = mm_typecode_to_str(matcode);
    int ret_code;

    /* mm_typecode_to_str() returns NULL for typecodes it cannot describe */
    if (str == NULL)
        return MM_UNSUPPORTED_TYPE;

    ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
    free(str);

    /* fprintf() returns the number of characters written (not the number of
       converted items) and a negative value on error */
    if (ret_code < 0)
        return MM_COULD_NOT_WRITE_FILE;

    return 0;
}
|
||||
|
||||
/*
 * Write a sparse (coordinate) matrix to a Matrix Market file.
 *
 * fname    file to write, or the literal string "stdout"
 * M, N, nz matrix dimensions and number of entries
 * I, J     row / column indices, written exactly as stored
 * val      nz values (real) or 2*nz values as re/im pairs (complex);
 *          not read for pattern matrices
 * matcode  typecode describing the matrix
 *
 * Returns 0 on success, MM_COULD_NOT_WRITE_FILE if the file cannot be
 * opened or closed, and MM_UNSUPPORTED_TYPE if the typecode cannot be
 * converted to text or its data type is not pattern, real or complex.
 * In the latter case the banner and size line have already been written.
 */
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
        double val[], MM_typecode matcode)
{
    FILE *f;
    int i;
    int ret_code = 0;
    /* Build the typecode text before opening (and truncating) the file;
       the string is malloc()ed and must be freed here. */
    char *type_str = mm_typecode_to_str(matcode);

    if (type_str == NULL)
        return MM_UNSUPPORTED_TYPE;

    if (strcmp(fname, "stdout") == 0)
        f = stdout;
    else
    if ((f = fopen(fname, "w")) == NULL)
    {
        free(type_str);
        return MM_COULD_NOT_WRITE_FILE;
    }

    /* print banner followed by typecode */
    fprintf(f, "%s ", MatrixMarketBanner);
    fprintf(f, "%s\n", type_str);
    free(type_str);

    /* print matrix sizes and nonzeros */
    fprintf(f, "%d %d %d\n", M, N, nz);

    /* print values */
    if (mm_is_pattern(matcode))
    {
        for (i=0; i<nz; i++)
            fprintf(f, "%d %d\n", I[i], J[i]);
    }
    else if (mm_is_real(matcode))
    {
        for (i=0; i<nz; i++)
            fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
    }
    else if (mm_is_complex(matcode))
    {
        for (i=0; i<nz; i++)
            fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i],
                        val[2*i+1]);
    }
    else
    {
        ret_code = MM_UNSUPPORTED_TYPE;
    }

    /* fclose() flushes buffered output, so a failure here is a write error */
    if (f != stdout && fclose(f) != 0 && ret_code == 0)
        ret_code = MM_COULD_NOT_WRITE_FILE;

    return ret_code;
}
|
||||
|
||||
|
||||
/**
*  Create a new copy of a string s.  mm_strdup() is a common routine, but
*  not part of ANSI C, so it is included here.  Used by mm_typecode_to_str().
*
*  Returns a malloc()ed copy that the caller must free(), or NULL if the
*  allocation fails.  s must not be NULL.
*/
char *mm_strdup(const char *s)
{
    size_t len = strlen(s);
    char *s2 = (char *) malloc(len + 1);

    if (s2 == NULL)
        return NULL;

    /* len + 1 also copies the terminating '\0' */
    memcpy(s2, s, len + 1);
    return s2;
}
|
||||
|
||||
/*
 * Convert a typecode to its textual banner form, e.g.
 * "matrix coordinate real general".
 *
 * Returns a malloc()ed string (via mm_strdup()) that the caller must
 * free(), or NULL if any of the four typecode fields is not recognised.
 */
char  *mm_typecode_to_str(MM_typecode matcode)
{
    char buffer[MM_MAX_LINE_LENGTH];
    const char *types[4];
    char *mm_strdup(const char *);

    /* check for MTX type */
    if (!mm_is_matrix(matcode))
        return NULL;    /* types[0] would otherwise be used uninitialised */
    types[0] = MM_MTX_STR;

    /* check for CRD or ARR matrix */
    if (mm_is_sparse(matcode))
        types[1] = MM_SPARSE_STR;
    else if (mm_is_dense(matcode))
        types[1] = MM_DENSE_STR;
    else
        return NULL;

    /* check for element data type */
    if (mm_is_real(matcode))
        types[2] = MM_REAL_STR;
    else if (mm_is_complex(matcode))
        types[2] = MM_COMPLEX_STR;
    else if (mm_is_pattern(matcode))
        types[2] = MM_PATTERN_STR;
    else if (mm_is_integer(matcode))
        types[2] = MM_INT_STR;
    else
        return NULL;


    /* check for symmetry type */
    if (mm_is_general(matcode))
        types[3] = MM_GENERAL_STR;
    else if (mm_is_symmetric(matcode))
        types[3] = MM_SYMM_STR;
    else if (mm_is_hermitian(matcode))
        types[3] = MM_HERM_STR;
    else if (mm_is_skew(matcode))
        types[3] = MM_SKEW_STR;
    else
        return NULL;

    /* the four fixed keywords plus separators are far shorter than buffer */
    sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
    return mm_strdup(buffer);

}
|
||||
135
examples/gmres/mmio.h
Normal file
135
examples/gmres/mmio.h
Normal file
@@ -0,0 +1,135 @@
|
||||
/*
|
||||
* Matrix Market I/O library for ANSI C
|
||||
*
|
||||
* See http://math.nist.gov/MatrixMarket for details.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MM_IO_H
|
||||
#define MM_IO_H
|
||||
|
||||
#define MM_MAX_LINE_LENGTH 1025
|
||||
#define MatrixMarketBanner "%%MatrixMarket"
|
||||
#define MM_MAX_TOKEN_LENGTH 64
|
||||
|
||||
typedef char MM_typecode[4];
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
char *mm_typecode_to_str(MM_typecode matcode);
|
||||
|
||||
int mm_read_banner(FILE *f, MM_typecode *matcode);
|
||||
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
|
||||
int mm_read_mtx_array_size(FILE *f, int *M, int *N);
|
||||
|
||||
int mm_write_banner(FILE *f, MM_typecode matcode);
|
||||
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
|
||||
int mm_write_mtx_array_size(FILE *f, int M, int N);
|
||||
|
||||
|
||||
/********************* MM_typecode query functions ***************************/
|
||||
|
||||
#define mm_is_matrix(typecode) ((typecode)[0]=='M')
|
||||
|
||||
#define mm_is_sparse(typecode) ((typecode)[1]=='C')
|
||||
#define mm_is_coordinate(typecode)((typecode)[1]=='C')
|
||||
#define mm_is_dense(typecode) ((typecode)[1]=='A')
|
||||
#define mm_is_array(typecode) ((typecode)[1]=='A')
|
||||
|
||||
#define mm_is_complex(typecode) ((typecode)[2]=='C')
|
||||
#define mm_is_real(typecode) ((typecode)[2]=='R')
|
||||
#define mm_is_pattern(typecode) ((typecode)[2]=='P')
|
||||
#define mm_is_integer(typecode) ((typecode)[2]=='I')
|
||||
|
||||
#define mm_is_symmetric(typecode)((typecode)[3]=='S')
|
||||
#define mm_is_general(typecode) ((typecode)[3]=='G')
|
||||
#define mm_is_skew(typecode) ((typecode)[3]=='K')
|
||||
#define mm_is_hermitian(typecode)((typecode)[3]=='H')
|
||||
|
||||
int mm_is_valid(MM_typecode matcode); /* too complex for a macro */
|
||||
|
||||
|
||||
/********************* MM_typecode modify functions ***************************/
|
||||
|
||||
/* Each setter takes a POINTER to an MM_typecode and stores one field.
   The argument is fully parenthesised so that expressions such as
   `codes + 1` dereference the intended typecode. */
#define mm_set_matrix(typecode)     ((*(typecode))[0]='M')
#define mm_set_coordinate(typecode) ((*(typecode))[1]='C')
#define mm_set_array(typecode)      ((*(typecode))[1]='A')
#define mm_set_dense(typecode)      mm_set_array(typecode)
#define mm_set_sparse(typecode)     mm_set_coordinate(typecode)

#define mm_set_complex(typecode)    ((*(typecode))[2]='C')
#define mm_set_real(typecode)       ((*(typecode))[2]='R')
#define mm_set_pattern(typecode)    ((*(typecode))[2]='P')
#define mm_set_integer(typecode)    ((*(typecode))[2]='I')


#define mm_set_symmetric(typecode)  ((*(typecode))[3]='S')
#define mm_set_general(typecode)    ((*(typecode))[3]='G')
#define mm_set_skew(typecode)       ((*(typecode))[3]='K')
#define mm_set_hermitian(typecode)  ((*(typecode))[3]='H')

/* Reset fields 0-2 to ' ' and field 3 to 'G' (general).  The argument is
   evaluated several times, so it must not have side effects. */
#define mm_clear_typecode(typecode) ((*(typecode))[0]=(*(typecode))[1]= \
                                    (*(typecode))[2]=' ',(*(typecode))[3]='G')

#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
|
||||
|
||||
|
||||
/********************* Matrix Market error codes ***************************/
|
||||
|
||||
|
||||
#define MM_COULD_NOT_READ_FILE 11
|
||||
#define MM_PREMATURE_EOF 12
|
||||
#define MM_NOT_MTX 13
|
||||
#define MM_NO_HEADER 14
|
||||
#define MM_UNSUPPORTED_TYPE 15
|
||||
#define MM_LINE_TOO_LONG 16
|
||||
#define MM_COULD_NOT_WRITE_FILE 17
|
||||
|
||||
|
||||
/******************** Matrix Market internal definitions ********************
|
||||
|
||||
MM_matrix_typecode: 4-character sequence
|
||||
|
||||
object sparse/ data storage
|
||||
dense type scheme
|
||||
|
||||
string position: [0] [1] [2] [3]
|
||||
|
||||
Matrix typecode: M(atrix) C(oord) R(eal) G(eneral)
|
||||
A(array) C(omplex) H(ermitian)
|
||||
P(attern) S(ymmetric)
|
||||
I(nteger) K(kew)
|
||||
|
||||
***********************************************************************/
|
||||
|
||||
#define MM_MTX_STR "matrix"
|
||||
#define MM_ARRAY_STR "array"
|
||||
#define MM_DENSE_STR "array"
|
||||
#define MM_COORDINATE_STR "coordinate"
|
||||
#define MM_SPARSE_STR "coordinate"
|
||||
#define MM_COMPLEX_STR "complex"
|
||||
#define MM_REAL_STR "real"
|
||||
#define MM_INT_STR "integer"
|
||||
#define MM_GENERAL_STR "general"
|
||||
#define MM_SYMM_STR "symmetric"
|
||||
#define MM_HERM_STR "hermitian"
|
||||
#define MM_SKEW_STR "skew-symmetric"
|
||||
#define MM_PATTERN_STR "pattern"
|
||||
|
||||
|
||||
/* high level routines */
|
||||
|
||||
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
|
||||
double val[], MM_typecode matcode);
|
||||
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
|
||||
double val[], MM_typecode matcode);
|
||||
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
|
||||
MM_typecode matcode);
|
||||
|
||||
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
|
||||
double **val_, int **I_, int **J_);
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
53
examples/gmres/util.h
Normal file
53
examples/gmres/util.h
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __UTIL_H__
|
||||
#define __UTIL_H__
|
||||
|
||||
#include <stdio.h>
|
||||
#include "matrix.h"
|
||||
|
||||
|
||||
/* Print matrix M to stdout under the heading "Matrix <name>:", one line per
   row (rows are labelled starting from 1), followed by a blank line. */
inline void printMatrix (DenseMatrix &M, const char *name) {
    printf("Matrix %s:\n", name);

    int r = 0;
    while (r < M.rows()) {
        printf("row %2d: ", r + 1);

        int c = 0;
        while (c < M.cols()) {
            printf("%6f ", M(r, c));
            ++c;
        }

        printf("\n");
        ++r;
    }

    printf("\n");
}
|
||||
|
||||
#endif
|
||||
1760
examples/intrinsics/generic-16.h
Normal file
1760
examples/intrinsics/generic-16.h
Normal file
File diff suppressed because it is too large
Load Diff
1828
examples/intrinsics/generic-32.h
Normal file
1828
examples/intrinsics/generic-32.h
Normal file
File diff suppressed because it is too large
Load Diff
1961
examples/intrinsics/generic-64.h
Normal file
1961
examples/intrinsics/generic-64.h
Normal file
File diff suppressed because it is too large
Load Diff
2124
examples/intrinsics/knc.h
Normal file
2124
examples/intrinsics/knc.h
Normal file
File diff suppressed because it is too large
Load Diff
2058
examples/intrinsics/knc2x.h
Normal file
2058
examples/intrinsics/knc2x.h
Normal file
File diff suppressed because it is too large
Load Diff
3989
examples/intrinsics/sse4.h
Normal file
3989
examples/intrinsics/sse4.h
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,30 +1,7 @@
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
|
||||
EXAMPLE=mandelbrot
|
||||
CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
|
||||
ISPC_SRC=mandelbrot.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
|
||||
default: mandelbrot
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ mandelbrot
|
||||
|
||||
OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc_sse2.o \
|
||||
objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o \
|
||||
objs/mandelbrot_ispc.o
|
||||
|
||||
mandelbrot: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
include ../common.mk
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -40,8 +40,10 @@ static inline int mandel(float c_re, float c_im, int count) {
|
||||
|
||||
float new_re = z_re*z_re - z_im*z_im;
|
||||
float new_im = 2.f * z_re * z_im;
|
||||
z_re = c_re + new_re;
|
||||
z_im = c_im + new_im;
|
||||
unmasked {
|
||||
z_re = c_re + new_re;
|
||||
z_im = c_im + new_im;
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
|
||||
@@ -1,39 +1,7 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
EXAMPLE=mandelbrot
|
||||
CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
|
||||
ISPC_SRC=mandelbrot.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
|
||||
|
||||
OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o $(TASK_OBJ) \
|
||||
objs/mandelbrot_ispc.o objs/mandelbrot_ispc_sse2.o \
|
||||
objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o
|
||||
|
||||
default: mandelbrot
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ mandelbrot
|
||||
|
||||
mandelbrot: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
include ../common.mk
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -41,25 +41,26 @@ mandel(float c_re, float c_im, int count) {
|
||||
|
||||
float new_re = z_re*z_re - z_im*z_im;
|
||||
float new_im = 2.f * z_re * z_im;
|
||||
z_re = c_re + new_re;
|
||||
z_im = c_im + new_im;
|
||||
unmasked {
|
||||
z_re = c_re + new_re;
|
||||
z_im = c_im + new_im;
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
|
||||
/* Task to compute the Mandelbrot iterations for a span of scanlines from
|
||||
[ystart,yend).
|
||||
/* Task to compute the Mandelbrot iterations for a single scanline.
|
||||
*/
|
||||
task void
|
||||
mandelbrot_scanlines(uniform int ybase, uniform int span,
|
||||
uniform float x0, uniform float dx,
|
||||
uniform float y0, uniform float dy,
|
||||
uniform int width, uniform int maxIterations,
|
||||
uniform int output[]) {
|
||||
uniform int ystart = ybase + taskIndex * span;
|
||||
uniform int yend = ystart + span;
|
||||
mandelbrot_scanline(uniform float x0, uniform float dx,
|
||||
uniform float y0, uniform float dy,
|
||||
uniform int width, uniform int height,
|
||||
uniform int span,
|
||||
uniform int maxIterations, uniform int output[]) {
|
||||
uniform int ystart = taskIndex * span;
|
||||
uniform int yend = min((taskIndex+1) * span, (unsigned int)height);
|
||||
|
||||
foreach (yi = ystart ... yend, xi = 0 ... width) {
|
||||
float x = x0 + xi * dx;
|
||||
@@ -71,20 +72,6 @@ mandelbrot_scanlines(uniform int ybase, uniform int span,
|
||||
}
|
||||
|
||||
|
||||
task void
|
||||
mandelbrot_chunk(uniform float x0, uniform float dx,
|
||||
uniform float y0, uniform float dy,
|
||||
uniform int width, uniform int height,
|
||||
uniform int maxIterations, uniform int output[]) {
|
||||
uniform int ystart = taskIndex * (height/taskCount);
|
||||
uniform int yend = (taskIndex+1) * (height/taskCount);
|
||||
uniform int span = 1;
|
||||
|
||||
launch[(yend-ystart)/span] < mandelbrot_scanlines(ystart, span, x0, dx, y0, dy,
|
||||
width, maxIterations, output) >;
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
uniform float x1, uniform float y1,
|
||||
@@ -92,7 +79,8 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
uniform int maxIterations, uniform int output[]) {
|
||||
uniform float dx = (x1 - x0) / width;
|
||||
uniform float dy = (y1 - y0) / height;
|
||||
uniform int span = 4;
|
||||
|
||||
launch[32] < mandelbrot_chunk(x0, dx, y0, dy, width, height,
|
||||
maxIterations, output) >;
|
||||
launch[height/span] mandelbrot_scanline(x0, dx, y0, dy, width, height, span,
|
||||
maxIterations, output);
|
||||
}
|
||||
|
||||
@@ -1,29 +1,7 @@
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4,avx-x2 --arch=x86-64
|
||||
EXAMPLE=noise
|
||||
CPP_SRC=$(EXAMPLE).cpp $(EXAMPLE)_serial.cpp
|
||||
ISPC_SRC=noise.ispc
|
||||
ISPC_TARGETS=sse2,sse4,avx-x2
|
||||
|
||||
OBJS=objs/noise.o objs/noise_serial.o objs/noise_ispc.o objs/noise_ispc_sse2.o \
|
||||
objs/noise_ispc_sse4.o objs/noise_ispc_avx.o
|
||||
|
||||
default: noise
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ noise
|
||||
|
||||
noise: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/noise.o: objs/noise_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
include ../common.mk
|
||||
|
||||
@@ -1,38 +1,7 @@
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
EXAMPLE=options
|
||||
CPP_SRC=options.cpp options_serial.cpp
|
||||
ISPC_SRC=options.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -g -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
|
||||
|
||||
OBJS=objs/options.o objs/options_serial.o objs/options_ispc.o \
|
||||
objs/options_ispc_sse2.o objs/options_ispc_sse4.o \
|
||||
objs/options_ispc_avx.o $(TASK_OBJ)
|
||||
|
||||
default: options
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ options
|
||||
|
||||
options: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/options.o: objs/options_ispc.h options_defs.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc options_defs.h
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
include ../common.mk
|
||||
|
||||
@@ -77,7 +77,7 @@ black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float T
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
uniform int nTasks = max((int)64, (int)count/16384);
|
||||
launch[nTasks] < bs_task(Sa, Xa, Ta, ra, va, result, count) >;
|
||||
launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
|
||||
}
|
||||
|
||||
|
||||
@@ -150,5 +150,5 @@ binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
|
||||
uniform float va[], uniform float result[],
|
||||
uniform int count) {
|
||||
uniform int nTasks = max((int)64, (int)count/16384);
|
||||
launch[nTasks] < binomial_task(Sa, Xa, Ta, ra, va, result, count) >;
|
||||
launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);
|
||||
}
|
||||
|
||||
7
examples/perfbench/Makefile
Normal file
7
examples/perfbench/Makefile
Normal file
@@ -0,0 +1,7 @@
|
||||
|
||||
# Build settings for the perfbench example. The build rules themselves are
# pulled in from ../common.mk at the bottom of this file.

# Example name. Spelled to match the source base name (perfbench.cpp /
# perfbench.ispc), the same way the options example sets EXAMPLE=options.
EXAMPLE=perfbench
# C++ sources: the driver and the non-ispc implementations.
CPP_SRC=perfbench.cpp perfbench_serial.cpp
# ispc source holding the kernels referenced through the ispc:: namespace.
ISPC_SRC=perfbench.ispc
# Comma-separated ispc target list.
ISPC_TARGETS=sse2,sse4,avx

include ../common.mk
|
||||
108
examples/perfbench/perfbench.cpp
Normal file
108
examples/perfbench/perfbench.cpp
Normal file
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
Copyright (c) 2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
|
||||
#include "perfbench_ispc.h"
|
||||
|
||||
// Signature shared by every benchmarked routine. main() calls it as
// (data array, element count, zero-filled array, result array).
typedef void (FuncType)(float *, int, float *, float *);

// One benchmark row: two routines that are timed against each other plus
// the labels used in the printed report.
// The member order must not change: tests[] relies on positional
// aggregate initialization.
struct PerfTest {
    FuncType   *aFunc;     // first routine to time
    const char *aName;     // label printed next to aFunc's cycle count
    FuncType   *bFunc;     // second routine to time
    const char *bName;     // label printed next to bFunc's cycle count
    const char *testName;  // description printed at the start of the line
};
|
||||
|
||||
extern void xyzSumAOS(float *a, int count, float *zeros, float *result);
|
||||
extern void xyzSumSOA(float *a, int count, float *zeros, float *result);
|
||||
|
||||
|
||||
// Fills ptr[0 .. count-1] with a ramp: element i receives
// float(i) / (1024 * 1024). Nothing is written when count <= 0.
static void
lInitData(float *ptr, int count) {
    const float scale = 1024.f * 1024.f;
    int idx = 0;
    while (idx < count) {
        ptr[idx] = float(idx) / scale;
        ++idx;
    }
}
|
||||
|
||||
// Benchmark table walked by main(), one row per comparison.
// Row layout: { aFunc, aName, bFunc, bName, testName }.
static PerfTest tests[] = {
    // xyzSum* routines declared above (labelled "serial") against their
    // ispc:: counterparts.
    {
        xyzSumAOS, "serial",
        ispc::xyzSumAOS, "ispc",
        "AOS vector element sum (with coalescing)"
    },
    {
        xyzSumAOS, "serial",
        ispc::xyzSumAOSStdlib, "ispc",
        "AOS vector element sum (stdlib swizzle)"
    },
    {
        xyzSumAOS, "serial",
        ispc::xyzSumAOSNoCoalesce, "ispc",
        "AOS vector element sum (no coalescing)"
    },
    {
        xyzSumSOA, "serial",
        ispc::xyzSumSOA, "ispc",
        "SOA vector element sum"
    },
    // ispc-vs-ispc comparisons of memory access routines.
    {
        ispc::gathers, "gather",
        ispc::loads, "vector load",
        "Memory reads"
    },
    {
        ispc::scatters, "scatter",
        ispc::stores, "vector store",
        "Memory writes"
    },
};
|
||||
|
||||
int main() {
|
||||
int count = 3*64*1024;
|
||||
float *a = new float[count];
|
||||
float zeros[32] = { 0 };
|
||||
|
||||
int nTests = sizeof(tests) / sizeof(tests[0]);
|
||||
for (int i = 0; i < nTests; ++i) {
|
||||
lInitData(a, count);
|
||||
reset_and_start_timer();
|
||||
float resultA[3] = { 0, 0, 0 };
|
||||
for (int j = 0; j < 100; ++j)
|
||||
tests[i].aFunc(a, count, zeros, resultA);
|
||||
double aTime = get_elapsed_mcycles();
|
||||
|
||||
lInitData(a, count);
|
||||
reset_and_start_timer();
|
||||
float resultB[3] = { 0, 0, 0 };
|
||||
for (int j = 0; j < 100; ++j)
|
||||
tests[i].bFunc(a, count, zeros, resultB);
|
||||
double bTime = get_elapsed_mcycles();
|
||||
|
||||
printf("%-40s: [%.2f] M cycles %s, [%.2f] M cycles %s (%.2fx speedup).\n",
|
||||
tests[i].testName, aTime, tests[i].aName, bTime, tests[i].bName,
|
||||
aTime/bTime);
|
||||
#if 0
|
||||
printf("\t(%f %f %f) - (%f %f %f)\n", resultSerial[0], resultSerial[1],
|
||||
resultSerial[2], resultISPC[0], resultISPC[1], resultISPC[2]);
|
||||
#endif
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user