Compare commits
742 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b8503b9255 | ||
|
|
b7bc76d3cc | ||
|
|
27d6c12972 | ||
|
|
b69d783e09 | ||
|
|
3b2ff6301c | ||
|
|
6c7043916e | ||
|
|
96a6e75b71 | ||
|
|
a91e4e7981 | ||
|
|
95d8f76ec3 | ||
|
|
66d4c2ddd9 | ||
|
|
e431b07e04 | ||
|
|
d34a87404d | ||
|
|
f38770bf2a | ||
|
|
2a4dff38d0 | ||
|
|
f558ee788e | ||
|
|
ceb8ca680c | ||
|
|
79ebcbec4b | ||
|
|
2c7b650240 | ||
|
|
54459255d4 | ||
|
|
b4a078e2f6 | ||
|
|
ed13dd066b | ||
|
|
2b4a3b22bf | ||
|
|
8b891da628 | ||
|
|
5a2c8342eb | ||
|
|
50eb4bf53a | ||
|
|
3c10ddd46a | ||
|
|
0b7f9acc70 | ||
|
|
10fbaec247 | ||
|
|
007a734595 | ||
|
|
46716aada3 | ||
|
|
3bc66136b2 | ||
|
|
fae47e0dfc | ||
|
|
bd52e86486 | ||
|
|
b2f6ed7209 | ||
|
|
4b334fd2e2 | ||
|
|
a23a7006e3 | ||
|
|
f47171a17c | ||
|
|
4945dc3682 | ||
|
|
ada66b5313 | ||
|
|
96450e17a3 | ||
|
|
40a295e951 | ||
|
|
d6c6f95373 | ||
|
|
19b46be20d | ||
|
|
789e04ce90 | ||
|
|
dd4f0a600b | ||
|
|
6c7df4cb6b | ||
|
|
79e0a9f32a | ||
|
|
6c9bc63a1c | ||
|
|
28a821df7d | ||
|
|
27e39954d6 | ||
|
|
e730a5364b | ||
|
|
92b3ae41dd | ||
|
|
89a2566e01 | ||
|
|
1ac3e03171 | ||
|
|
b86d40091a | ||
|
|
91d22d150f | ||
|
|
1d29991268 | ||
|
|
6f0a2686dc | ||
|
|
f06caabb07 | ||
|
|
3c869802fb | ||
|
|
7b6bd90903 | ||
|
|
967bfa9c92 | ||
|
|
592affb984 | ||
|
|
96aaf6d53b | ||
|
|
1397dbdabc | ||
|
|
6118643232 | ||
|
|
71198a0b54 | ||
|
|
22cb80399f | ||
|
|
6df7d31a5b | ||
|
|
ef049e92ef | ||
|
|
fe8b109ca5 | ||
|
|
8fd9b84a80 | ||
|
|
5cb53f52c3 | ||
|
|
d86653668e | ||
|
|
5084712a15 | ||
|
|
ece65cab18 | ||
|
|
1f6075506c | ||
|
|
51ade48e3d | ||
|
|
21c43737fe | ||
|
|
6c7bcf00e7 | ||
|
|
7a2142075c | ||
|
|
e8e9baa417 | ||
|
|
449d956966 | ||
|
|
90db01d038 | ||
|
|
38cea6dc71 | ||
|
|
64807dfb3b | ||
|
|
d943455e10 | ||
|
|
fd03ba7586 | ||
|
|
2c5a57e386 | ||
|
|
e8858150cb | ||
|
|
333f901187 | ||
|
|
7dd4d6c75e | ||
|
|
99f57cfda6 | ||
|
|
4d1eb94dfd | ||
|
|
22d584f302 | ||
|
|
72c41f104e | ||
|
|
8d3ac3ac1e | ||
|
|
299ae186f1 | ||
|
|
f4df2fb176 | ||
|
|
625fbef613 | ||
|
|
fbed0ac56b | ||
|
|
dc120f3962 | ||
|
|
4f053e5b83 | ||
|
|
c6241581a0 | ||
|
|
041ade66d5 | ||
|
|
067a2949ba | ||
|
|
55c754750e | ||
|
|
72b6c12856 | ||
|
|
15ea0af687 | ||
|
|
ee7e367981 | ||
|
|
8006589828 | ||
|
|
413264eaae | ||
|
|
7db8824da2 | ||
|
|
e1bc010bd1 | ||
|
|
bff02017da | ||
|
|
c0019bd8e5 | ||
|
|
e495ef2c48 | ||
|
|
78d62705cc | ||
|
|
2791bd0015 | ||
|
|
7cf66eb61f | ||
|
|
944c53bff1 | ||
|
|
c756c855ea | ||
|
|
58bb2826b2 | ||
|
|
b7bef87a4d | ||
|
|
0c1b206185 | ||
|
|
7d7e99a92c | ||
|
|
1ba8d7ef74 | ||
|
|
d99bd279e8 | ||
|
|
ee1fe3aa9f | ||
|
|
c4b1d79c5c | ||
|
|
a1a43cdfe0 | ||
|
|
27b62781cc | ||
|
|
0c5d7ff8f2 | ||
|
|
0e2b315ded | ||
|
|
3e74d1c544 | ||
|
|
da690acce5 | ||
|
|
0baa2b484d | ||
|
|
260d7298c3 | ||
|
|
d5cc2ad643 | ||
|
|
12706cd37f | ||
|
|
7167442d6e | ||
|
|
8547101c4b | ||
|
|
5d58a9e4c2 | ||
|
|
cd98a29a4b | ||
|
|
903714fd40 | ||
|
|
138c7acf22 | ||
|
|
03b2b8ae8f | ||
|
|
016b502d46 | ||
|
|
c5f6653564 | ||
|
|
cf9a4e209e | ||
|
|
040421942f | ||
|
|
4dfc596d38 | ||
|
|
fe83ef7635 | ||
|
|
db8b08131f | ||
|
|
32815e628d | ||
|
|
71bdc67a45 | ||
|
|
cb9f50ef63 | ||
|
|
12c754c92b | ||
|
|
e4b3d03da5 | ||
|
|
cc26b66e99 | ||
|
|
34d81fa522 | ||
|
|
49f1a5c2b3 | ||
|
|
326c45fa17 | ||
|
|
a2bb899a6b | ||
|
|
9fedb1674e | ||
|
|
7c91b01125 | ||
|
|
c202e9e106 | ||
|
|
645a8c9349 | ||
|
|
abf7c423bb | ||
|
|
55d5c07d00 | ||
|
|
b9d6ba2aa0 | ||
|
|
a0c9f7823b | ||
|
|
99a27fe241 | ||
|
|
fefa86e0cf | ||
|
|
098c4910de | ||
|
|
17b7148300 | ||
|
|
f4a2ef28e3 | ||
|
|
f0d013ee76 | ||
|
|
5ece6fec04 | ||
|
|
d88dbf3612 | ||
|
|
2a18efef82 | ||
|
|
fd846fbe77 | ||
|
|
ca7cc4744e | ||
|
|
491fa239bd | ||
|
|
66765dc123 | ||
|
|
70a5348f43 | ||
|
|
2aa61007c6 | ||
|
|
acfbe77ffc | ||
|
|
08696653ca | ||
|
|
8a1a214ca9 | ||
|
|
7aaeb27e0f | ||
|
|
972043c146 | ||
|
|
8475dc082a | ||
|
|
d0e583b29c | ||
|
|
c8feee238b | ||
|
|
6712ecd928 | ||
|
|
d0c7b5d35c | ||
|
|
802add1f97 | ||
|
|
95556811fa | ||
|
|
581472564d | ||
|
|
c7dc8862a5 | ||
|
|
4f8cf019ca | ||
|
|
4c9ac7fcf1 | ||
|
|
1dac05960a | ||
|
|
c27418da77 | ||
|
|
637d076e99 | ||
|
|
391678a5b3 | ||
|
|
4cd0cf1650 | ||
|
|
b813452d33 | ||
|
|
eb85da81e1 | ||
|
|
920cf63201 | ||
|
|
dc09d46bf4 | ||
|
|
05d1b06eeb | ||
|
|
c1661eb06b | ||
|
|
e9626a1d10 | ||
|
|
560bf5ca09 | ||
|
|
87c8a89349 | ||
|
|
255791f18e | ||
|
|
d5e3416e8e | ||
|
|
5b2d43f665 | ||
|
|
540fc6c2f3 | ||
|
|
b3c5043dcc | ||
|
|
d0d9aae968 | ||
|
|
3270e2bf5a | ||
|
|
013a3e7567 | ||
|
|
8368ba8539 | ||
|
|
ca0310e335 | ||
|
|
4690a678c1 | ||
|
|
f8a39402a2 | ||
|
|
247775d1ec | ||
|
|
6e9fea377d | ||
|
|
ca5c65d032 | ||
|
|
f9dc621ebe | ||
|
|
ffe484c31e | ||
|
|
62cd3418ca | ||
|
|
d8a8f3a996 | ||
|
|
0ad8dbbfc9 | ||
|
|
e15a1946c6 | ||
|
|
8878826661 | ||
|
|
95a8b6e5e8 | ||
|
|
388d0d2cfd | ||
|
|
d3a374e71c | ||
|
|
1da2834b1e | ||
|
|
ca3100874f | ||
|
|
117f48a331 | ||
|
|
89bbceefee | ||
|
|
7e18f0e247 | ||
|
|
3bb2dee275 | ||
|
|
88cd5584e8 | ||
|
|
20044f5749 | ||
|
|
10c5ba140c | ||
|
|
316de0b880 | ||
|
|
989966f81b | ||
|
|
ccd550dc52 | ||
|
|
ddf350839a | ||
|
|
6a7dd2787a | ||
|
|
349ab0b9c5 | ||
|
|
b5e6c6a2f3 | ||
|
|
2832ea641f | ||
|
|
cb7edf2725 | ||
|
|
f1f1be2822 | ||
|
|
7dffd65609 | ||
|
|
2c8a44e28b | ||
|
|
39bb95a6ee | ||
|
|
da9dba80a0 | ||
|
|
12f3285f9b | ||
|
|
7e954e4248 | ||
|
|
d74cc6397b | ||
|
|
777343331e | ||
|
|
a062653743 | ||
|
|
57af0eb64f | ||
|
|
60aae16752 | ||
|
|
e264d95019 | ||
|
|
0664f5a724 | ||
|
|
17c6a19527 | ||
|
|
cbc8b8259b | ||
|
|
1067a2e4be | ||
|
|
74a031a759 | ||
|
|
ee437193fb | ||
|
|
436c53037e | ||
|
|
f55ba9d3cb | ||
|
|
8adb99b768 | ||
|
|
13c42412d2 | ||
|
|
75507d8b35 | ||
|
|
ddfe4932ac | ||
|
|
28ac016928 | ||
|
|
9ec8e5a275 | ||
|
|
a473046058 | ||
|
|
a69b7a5a01 | ||
|
|
640918bcc0 | ||
|
|
f39fbdb3fc | ||
|
|
50d4d81062 | ||
|
|
3b95452481 | ||
|
|
c152ae3c32 | ||
|
|
f6cbaa78e8 | ||
|
|
7adb250b59 | ||
|
|
db5db5aefd | ||
|
|
8fdf84de04 | ||
|
|
ff5cbe80d1 | ||
|
|
e013e0a374 | ||
|
|
b7df312ca7 | ||
|
|
ce82c3c0ae | ||
|
|
2f958cfbda | ||
|
|
8ef41dfd97 | ||
|
|
3082ea4765 | ||
|
|
e482d29951 | ||
|
|
ff48dd7bfb | ||
|
|
7bf9c11822 | ||
|
|
f7937f1e4b | ||
|
|
0115eeabfe | ||
|
|
4b9c3ec0da | ||
|
|
55b81e35a7 | ||
|
|
2a1c7f2d47 | ||
|
|
8603f9838f | ||
|
|
95224f3f11 | ||
|
|
f81acbfe80 | ||
|
|
6d7ff7eba2 | ||
|
|
ad429db7e8 | ||
|
|
4c07abbaf4 | ||
|
|
e3c0551129 | ||
|
|
8971baa42b | ||
|
|
317a1f51f7 | ||
|
|
c63d139482 | ||
|
|
9e682362e9 | ||
|
|
56ec939692 | ||
|
|
a86b942730 | ||
|
|
52eb4c6014 | ||
|
|
f4adbbf90c | ||
|
|
cc86e4a7d2 | ||
|
|
e864447e4a | ||
|
|
73bf552cd6 | ||
|
|
f20a2d2ee9 | ||
|
|
0c25bc063c | ||
|
|
db72781d2a | ||
|
|
0c8ad09040 | ||
|
|
49880ab761 | ||
|
|
fe2d9aa600 | ||
|
|
1dead425e4 | ||
|
|
adb1e47a59 | ||
|
|
ffba8580c1 | ||
|
|
ea18427d29 | ||
|
|
f3089df086 | ||
|
|
157e7c97ae | ||
|
|
bb8e13e3c9 | ||
|
|
5b4673e8eb | ||
|
|
5b9de8cc07 | ||
|
|
33ea934c8f | ||
|
|
6b3e14b0a4 | ||
|
|
098ceb5567 | ||
|
|
8e2b0632e8 | ||
|
|
420d373d89 | ||
|
|
a59fd7eeb3 | ||
|
|
ee91fa1228 | ||
|
|
a2b5ce0172 | ||
|
|
3efbc71a01 | ||
|
|
b7c5af7e64 | ||
|
|
f939015b97 | ||
|
|
a9ed71f553 | ||
|
|
96a429694f | ||
|
|
fddc5e022e | ||
|
|
2236d53def | ||
|
|
4e018d0a20 | ||
|
|
977b983771 | ||
|
|
fa7a7fe23e | ||
|
|
724a843bbd | ||
|
|
a9ec745275 | ||
|
|
c2ecc15b93 | ||
|
|
83c8650b36 | ||
|
|
89cb809922 | ||
|
|
fdb4eaf437 | ||
|
|
0432f97555 | ||
|
|
8d1631b714 | ||
|
|
dac091552d | ||
|
|
ea027a95a8 | ||
|
|
f73abb05a7 | ||
|
|
d71c49494f | ||
|
|
25665f0841 | ||
|
|
1eec27f890 | ||
|
|
950f86200b | ||
|
|
e19f4931d1 | ||
|
|
0575b1f38d | ||
|
|
f6cd01f7cf | ||
|
|
f2fbc168af | ||
|
|
b50f6f1730 | ||
|
|
f8a7120d9c | ||
|
|
20dbf59420 | ||
|
|
c67a286aa6 | ||
|
|
c96fef6bc8 | ||
|
|
bba02f87ea | ||
|
|
12dc3f5c28 | ||
|
|
0f01a5dcbe | ||
|
|
664dc3bdda | ||
|
|
bdba3cd97d | ||
|
|
d9c0f9315a | ||
|
|
b7f17d435f | ||
|
|
37cdc18639 | ||
|
|
5893a9c49d | ||
|
|
24f58fa16a | ||
|
|
56ffc78fa4 | ||
|
|
061e68bc77 | ||
|
|
177e6312b4 | ||
|
|
1acf4032c2 | ||
|
|
9c5444698e | ||
|
|
65f3252760 | ||
|
|
e612abe4ba | ||
|
|
34352e4e0e | ||
|
|
1867b5b317 | ||
|
|
a5b7fca7e0 | ||
|
|
7be2c399b1 | ||
|
|
d6337b3b22 | ||
|
|
d2f8b0ace5 | ||
|
|
d805e8b183 | ||
|
|
1f0f2ec05f | ||
|
|
91ac3b9d7c | ||
|
|
d65bf2eb2f | ||
|
|
1bba9d4307 | ||
|
|
4388338dad | ||
|
|
2fb59c90cf | ||
|
|
68f6ea8def | ||
|
|
3f89295d10 | ||
|
|
748b292e77 | ||
|
|
6451c3d99d | ||
|
|
d14a2de168 | ||
|
|
642150095d | ||
|
|
3bf3ac7922 | ||
|
|
c6d1cebad4 | ||
|
|
08189ce08c | ||
|
|
7013d7d52f | ||
|
|
7045b76f84 | ||
|
|
58a0b4a20d | ||
|
|
0f8eee9809 | ||
|
|
0740299860 | ||
|
|
652215861e | ||
|
|
602209e5a8 | ||
|
|
b60f8b4f70 | ||
|
|
b67446d998 | ||
|
|
9670ab0887 | ||
|
|
0223bb85ee | ||
|
|
fd81255db1 | ||
|
|
8a8e1a7f73 | ||
|
|
ef05fbf424 | ||
|
|
fa01b63fa5 | ||
|
|
63d3d25030 | ||
|
|
a8db866228 | ||
|
|
0519eea951 | ||
|
|
f4653ecd11 | ||
|
|
5d67252ed0 | ||
|
|
5134de71c0 | ||
|
|
2be1251c70 | ||
|
|
c0161aa17f | ||
|
|
b683aa11b1 | ||
|
|
2654bb0112 | ||
|
|
d8728104b4 | ||
|
|
0be1b70fba | ||
|
|
a0e9793de3 | ||
|
|
da9200fcee | ||
|
|
54e8e8022b | ||
|
|
d84cf781da | ||
|
|
002f27a30f | ||
|
|
86d88e9773 | ||
|
|
fda00afe6e | ||
|
|
be0c77d556 | ||
|
|
0ed11a7832 | ||
|
|
ff6971fb15 | ||
|
|
5b4dbc8167 | ||
|
|
59f4c9985e | ||
|
|
8da9be1a09 | ||
|
|
11033e108e | ||
|
|
4f97262cf2 | ||
|
|
9b68b9087a | ||
|
|
15cc812e37 | ||
|
|
71317e6aa6 | ||
|
|
1abaaee73e | ||
|
|
78c6d3c02f | ||
|
|
48e9d4af39 | ||
|
|
cb7ad371c6 | ||
|
|
2951589825 | ||
|
|
f23dc5366a | ||
|
|
e3341176c5 | ||
|
|
8938e14442 | ||
|
|
4151778f5e | ||
|
|
23b85cd88d | ||
|
|
234e5cd3e1 | ||
|
|
f75c94a8f1 | ||
|
|
848a432640 | ||
|
|
dea13979e0 | ||
|
|
052d34bf5b | ||
|
|
d4c5e82896 | ||
|
|
562d61caff | ||
|
|
75f18c7c66 | ||
|
|
5d35349dc9 | ||
|
|
1a81173c93 | ||
|
|
1d9201fe3d | ||
|
|
6dbb15027a | ||
|
|
f23d030e43 | ||
|
|
701334ccf2 | ||
|
|
f48a662ed3 | ||
|
|
ced3f1f5fc | ||
|
|
018aa96c8b | ||
|
|
34eda04d9b | ||
|
|
45767ad197 | ||
|
|
f9463af75b | ||
|
|
6f6e28077f | ||
|
|
0a9a7c939a | ||
|
|
f30a5dea79 | ||
|
|
018b547c40 | ||
|
|
e82a720223 | ||
|
|
8d1b77b235 | ||
|
|
b8987faeee | ||
|
|
17fdab2793 | ||
|
|
1fa6520cb6 | ||
|
|
b6af5c16c6 | ||
|
|
10ebe88abf | ||
|
|
c0b41ad6f5 | ||
|
|
9920b30318 | ||
|
|
07f218137a | ||
|
|
89a5248f4f | ||
|
|
891919074e | ||
|
|
4adf527a4d | ||
|
|
533b539780 | ||
|
|
6f26ae9801 | ||
|
|
ddcdfff3ae | ||
|
|
5b48354d9a | ||
|
|
46bfef3fce | ||
|
|
20536bb339 | ||
|
|
f6605ee465 | ||
|
|
034507a35b | ||
|
|
0b2febcec0 | ||
|
|
d2fa735ef1 | ||
|
|
20f34b67da | ||
|
|
03f3db1e89 | ||
|
|
9805b0742d | ||
|
|
6000c696b2 | ||
|
|
5a2edf723b | ||
|
|
aec7da740a | ||
|
|
a79bc75b72 | ||
|
|
eaaebf7928 | ||
|
|
198aa9620e | ||
|
|
27c53a3c25 | ||
|
|
bd70182369 | ||
|
|
04df63d955 | ||
|
|
d59131d670 | ||
|
|
9475e13d81 | ||
|
|
765d86076f | ||
|
|
e2b6ed3db8 | ||
|
|
b22943b4a4 | ||
|
|
13df5f1cb9 | ||
|
|
f19c2aba40 | ||
|
|
ffc1d97df7 | ||
|
|
9dd498718b | ||
|
|
6181ce59ae | ||
|
|
48a6c2a35b | ||
|
|
0388f46a3b | ||
|
|
e3cae098fe | ||
|
|
455d963962 | ||
|
|
d748c501c9 | ||
|
|
5b8596102a | ||
|
|
dc525f281d | ||
|
|
f95504fb5e | ||
|
|
32904dfa11 | ||
|
|
186d0223d2 | ||
|
|
3efbfc30b7 | ||
|
|
0fd7811344 | ||
|
|
d492ba08e6 | ||
|
|
a1c0b4f95a | ||
|
|
c3b55de1ad | ||
|
|
e07ef6d46a | ||
|
|
3e4d69cbd3 | ||
|
|
511a3ab15a | ||
|
|
24ef9dac8f | ||
|
|
3bb6bff15d | ||
|
|
1390aed99c | ||
|
|
82aa6efd12 | ||
|
|
f90aa172a6 | ||
|
|
a2f118a14e | ||
|
|
c5aecd51e9 | ||
|
|
4d6bcdf41c | ||
|
|
8bc7367109 | ||
|
|
b48775a549 | ||
|
|
d4d6bc5d7f | ||
|
|
7a2561c429 | ||
|
|
1703f2717c | ||
|
|
92c46a2fc7 | ||
|
|
c995902796 | ||
|
|
6b9b7437ed | ||
|
|
a3641d7691 | ||
|
|
e780662a3f | ||
|
|
d65c02f323 | ||
|
|
11547cb950 | ||
|
|
bbb32c0c5d | ||
|
|
b1ae307163 | ||
|
|
e52104ff55 | ||
|
|
4ca90272ba | ||
|
|
2a6e3e5fea | ||
|
|
867efc2bce | ||
|
|
975db80ef6 | ||
|
|
15a7d353ab | ||
|
|
5828f7da07 | ||
|
|
d3e6879223 | ||
|
|
068ea3e4c4 | ||
|
|
f5a21d96a1 | ||
|
|
7290f7b16b | ||
|
|
79684a0bed | ||
|
|
6c8a064a5a | ||
|
|
e5327a0f5a | ||
|
|
f8eb100c60 | ||
|
|
7a1ce558e9 | ||
|
|
cabe358c0a | ||
|
|
b0d476fcdc | ||
|
|
51ccfffbd0 | ||
|
|
5fc8df3e55 | ||
|
|
ba9bb3338f | ||
|
|
afcd42028f | ||
|
|
f1d8ff96ce | ||
|
|
d528533fba | ||
|
|
7d6f89c8d2 | ||
|
|
43a2d510bf | ||
|
|
6084d6aeaf | ||
|
|
d224252b5d | ||
|
|
e009c0a61d | ||
|
|
d5a8538192 | ||
|
|
cc298cd5fe | ||
|
|
8b719e4c4e | ||
|
|
a7dff17b35 | ||
|
|
074cbc2716 | ||
|
|
114cb5b5c7 | ||
|
|
f45ab0744e | ||
|
|
9b8ea3d500 | ||
|
|
290032f4f5 | ||
|
|
19087e4761 | ||
|
|
70047fbf5f | ||
|
|
39ed7e14b2 | ||
|
|
209d093720 | ||
|
|
fc2954419d | ||
|
|
422b8268a9 | ||
|
|
1ab05c0351 | ||
|
|
c21e704a5c | ||
|
|
9f2aa8d92a | ||
|
|
2460fa5c83 | ||
|
|
dce25249ce | ||
|
|
61adc74072 | ||
|
|
88e317f1a9 | ||
|
|
49454bc207 | ||
|
|
286c23426e | ||
|
|
1198520029 | ||
|
|
06d70376ea | ||
|
|
7cd7ca82d6 | ||
|
|
ecda4561bd | ||
|
|
a89e26d725 | ||
|
|
3cb0115dce | ||
|
|
f5391747b9 | ||
|
|
b8768ffdfa | ||
|
|
6009608bc6 | ||
|
|
ce7355f9ed | ||
|
|
6b4459d402 | ||
|
|
790dba2558 | ||
|
|
4a2cbf2c4e | ||
|
|
53dd65fa2e | ||
|
|
f5afa52fd9 | ||
|
|
f9c67ff806 | ||
|
|
ec5e627e56 | ||
|
|
ff2a43ac19 | ||
|
|
9feea32471 | ||
|
|
bedaec2295 | ||
|
|
a68d137df6 | ||
|
|
59caa3d4e1 | ||
|
|
06975bc7ab | ||
|
|
880cbb18cc | ||
|
|
686d9975b6 | ||
|
|
9b7f55a28e | ||
|
|
e4d224a0f1 | ||
|
|
0933a77c1b | ||
|
|
5f78edf07a | ||
|
|
a6fc657b40 | ||
|
|
fa5050d5c7 | ||
|
|
d5a48d9a1e | ||
|
|
2df9da2524 | ||
|
|
0b02f94988 | ||
|
|
65c50b60fc | ||
|
|
9de34eb22c | ||
|
|
f8f25a11b6 | ||
|
|
cb7976bbf6 | ||
|
|
5ee4d7fce8 | ||
|
|
8f3e46f67e | ||
|
|
9ed07ff2b5 | ||
|
|
32a0a30cf5 | ||
|
|
6d39d5fc3e | ||
|
|
c999c8a237 | ||
|
|
aad269fdf4 | ||
|
|
d45c536c47 | ||
|
|
f1b8e5b1bf | ||
|
|
e7a70b05af | ||
|
|
cf73286938 | ||
|
|
e6f80c0adc | ||
|
|
5e31d7b6d0 | ||
|
|
649f2ad7b7 | ||
|
|
fade1cdf1d | ||
|
|
d261105a86 | ||
|
|
b3d3e8987b | ||
|
|
4e91f3777a | ||
|
|
5584240c7f | ||
|
|
7126a39092 | ||
|
|
8ad28a3f6f | ||
|
|
9921b8e530 | ||
|
|
9052d4b10b | ||
|
|
2405dae8e6 | ||
|
|
3607f3e045 | ||
|
|
de84acfa5d | ||
|
|
a501ab1aa6 | ||
|
|
cdc850f98c | ||
|
|
ca87579f23 | ||
|
|
38fc13d1ab | ||
|
|
cf9d9f717e | ||
|
|
173632f446 | ||
|
|
1dedd88132 | ||
|
|
0848c2cc19 | ||
|
|
e2a88d491f | ||
|
|
30f9dcd4f5 | ||
|
|
0c344b6755 | ||
|
|
6734021520 | ||
|
|
dd153d3c5c | ||
|
|
9ca7541d52 | ||
|
|
0c20483853 | ||
|
|
9d4ff1bc06 | ||
|
|
83f22f1939 | ||
|
|
6375ed9224 | ||
|
|
cf23cf9ef4 | ||
|
|
1147b53dcd | ||
|
|
4cf831a651 | ||
|
|
785d8a29d3 | ||
|
|
46d2bad231 | ||
|
|
32da8e11b4 | ||
|
|
5dedb6f836 | ||
|
|
2ea6d249d5 | ||
|
|
c86128e8ee | ||
|
|
375f1cb8e8 | ||
|
|
3ca7b6b078 | ||
|
|
effe901890 | ||
|
|
4f451bd041 | ||
|
|
c76ef7b174 | ||
|
|
743d82e935 | ||
|
|
18546e9c6d |
9
.gitignore
vendored
9
.gitignore
vendored
@@ -5,4 +5,11 @@ ispc
|
||||
ispc_test
|
||||
objs
|
||||
docs/doxygen
|
||||
docs/ispc.html
|
||||
docs/*.html
|
||||
tests*/*cpp
|
||||
tests*/*run
|
||||
examples/*/*.png
|
||||
examples/*/*.ppm
|
||||
examples/*/objs/*
|
||||
|
||||
|
||||
|
||||
143
Makefile
143
Makefile
@@ -2,41 +2,76 @@
|
||||
# ispc Makefile
|
||||
#
|
||||
|
||||
# If you have your own special version of llvm and/or clang, change
|
||||
# these variables to match.
|
||||
LLVM_CONFIG=$(shell which llvm-config)
|
||||
CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
|
||||
|
||||
# Add llvm bin to the path so any scripts run will go to the right llvm-config
|
||||
LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
|
||||
export PATH:=$(LLVM_BIN):$(PATH)
|
||||
|
||||
ARCH_OS = $(shell uname)
|
||||
ifeq ($(ARCH_OS), Darwin)
|
||||
ARCH_OS2 = "OSX"
|
||||
else
|
||||
ARCH_OS2 = $(shell uname -o)
|
||||
endif
|
||||
ARCH_TYPE = $(shell arch)
|
||||
|
||||
ifeq ($(shell $(LLVM_CONFIG) --version), 3.0)
|
||||
LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs)
|
||||
else
|
||||
LLVM_LIBS=-lLLVMAsmParser -lLLVMInstrumentation -lLLVMLinker \
|
||||
-lLLVMArchive -lLLVMBitReader -lLLVMDebugInfo -lLLVMJIT -lLLVMipo \
|
||||
-lLLVMBitWriter -lLLVMTableGen \
|
||||
-lLLVMX86Disassembler -lLLVMX86CodeGen -lLLVMSelectionDAG \
|
||||
-lLLVMAsmPrinter -lLLVMX86AsmParser -lLLVMX86Desc -lLLVMX86Info \
|
||||
-lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCDisassembler -lLLVMMCParser \
|
||||
-lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMTransformUtils \
|
||||
-lLLVMipa -lLLVMAnalysis -lLLVMMCJIT -lLLVMRuntimeDyld \
|
||||
-lLLVMExecutionEngine -lLLVMTarget -lLLVMMC -lLLVMObject -lLLVMCore \
|
||||
-lLLVMSupport
|
||||
endif
|
||||
|
||||
CLANG=clang
|
||||
CLANG_LIBS = -lclangFrontend -lclangDriver \
|
||||
-lclangSerialization -lclangParse -lclangSema \
|
||||
-lclangAnalysis -lclangAST -lclangLex -lclangBasic
|
||||
ifneq ($(shell $(LLVM_CONFIG) --version), 3.0)
|
||||
CLANG_LIBS += -lclangEdit
|
||||
endif
|
||||
|
||||
ISPC_LIBS=$(CLANG_LIBS) \
|
||||
$(shell llvm-config --ldflags --libs) \
|
||||
-lpthread -ldl
|
||||
ISPC_TEST_LIBS=$(shell llvm-config --ldflags --libs) \
|
||||
-lpthread -ldl
|
||||
ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
|
||||
-lpthread
|
||||
|
||||
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
|
||||
LLVM_VERSION=$(shell llvm-config --version | sed s/\\./_/)
|
||||
LLVM_VERSION_DEF=-DLLVM_$(LLVM_VERSION)
|
||||
ifeq ($(ARCH_OS),Linux)
|
||||
ISPC_LIBS += -ldl
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH_OS2),Msys)
|
||||
ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
|
||||
endif
|
||||
|
||||
LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
|
||||
LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//)
|
||||
LLVM_VERSION_DEF=-D$(LLVM_VERSION)
|
||||
|
||||
BUILD_DATE=$(shell date +%Y%m%d)
|
||||
BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
|
||||
|
||||
CXX=g++
|
||||
CPP=cpp
|
||||
CXXFLAGS=-g3 $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
|
||||
OPT=-O2
|
||||
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \
|
||||
-Wall $(LLVM_VERSION_DEF) \
|
||||
-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
|
||||
|
||||
LDFLAGS=
|
||||
ifeq ($(ARCH_OS),Linux)
|
||||
# try to link everything statically under Linux (including libstdc++) so
|
||||
# that the binaries we generate will be portable across distributions...
|
||||
ifeq ($(ARCH_TYPE),x86_64)
|
||||
LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
|
||||
else
|
||||
LDFLAGS=-L/usr/lib/gcc/i686-redhat-linux/4.6.0
|
||||
endif
|
||||
LDFLAGS=-static
|
||||
endif
|
||||
|
||||
LEX=flex
|
||||
@@ -44,21 +79,25 @@ YACC=bison -d -v -t
|
||||
|
||||
###########################################################################
|
||||
|
||||
CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
|
||||
llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
|
||||
util.cpp
|
||||
HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
|
||||
CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
|
||||
ispc.cpp llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp \
|
||||
type.cpp util.cpp
|
||||
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
|
||||
opt.h stmt.h sym.h type.h util.h
|
||||
BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
|
||||
builtins-sse4.ll builtins-sse4x2.ll
|
||||
TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
|
||||
generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
|
||||
BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
|
||||
builtins/dispatch.ll
|
||||
BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
|
||||
builtins-c-32.cpp builtins-c-64.cpp
|
||||
BISON_SRC=parse.yy
|
||||
FLEX_SRC=lex.ll
|
||||
|
||||
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
|
||||
builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
|
||||
$(FLEX_SRC:.ll=.o))
|
||||
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
|
||||
stdlib_generic_ispc.o stdlib_x86_ispc.o \
|
||||
$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
|
||||
|
||||
default: ispc ispc_test
|
||||
default: ispc
|
||||
|
||||
.PHONY: dirs clean depend doxygen print_llvm_src
|
||||
.PRECIOUS: objs/builtins-%.cpp
|
||||
@@ -77,7 +116,7 @@ print_llvm_src:
|
||||
@echo Using LLVM `llvm-config --version` from `llvm-config --libdir`
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs ispc ispc_test
|
||||
/bin/rm -rf objs ispc
|
||||
|
||||
doxygen:
|
||||
/bin/rm -rf docs/doxygen
|
||||
@@ -85,16 +124,20 @@ doxygen:
|
||||
|
||||
ispc: print_llvm_src dirs $(OBJS)
|
||||
@echo Creating ispc executable
|
||||
@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
|
||||
|
||||
ispc_test: dirs ispc_test.cpp
|
||||
@echo Creating ispc_test executable
|
||||
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(ISPC_TEST_LIBS)
|
||||
@$(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/cbackend.o: cbackend.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/%.o: objs/%.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/parse.cc: parse.yy
|
||||
@echo Running bison on $<
|
||||
@$(YACC) -o $@ $<
|
||||
@@ -111,34 +154,24 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll
|
||||
@echo Creating C++ source from builtin definitions file $<
|
||||
@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
|
||||
|
||||
objs/builtins-%.o: objs/builtins-%.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/builtins-c-32.cpp: builtins-c.c
|
||||
objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
|
||||
@echo Creating C++ source from builtins definition file $<
|
||||
@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@
|
||||
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | python bitcode2cpp.py $< > $@
|
||||
|
||||
objs/builtins-c-32.o: objs/builtins-c-32.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/builtins-c-64.cpp: builtins-c.c
|
||||
objs/builtins-c-32.cpp: builtins/builtins.c
|
||||
@echo Creating C++ source from builtins definition file $<
|
||||
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@
|
||||
@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-32 > $@
|
||||
|
||||
objs/builtins-c-64.o: objs/builtins-c-64.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
objs/builtins-c-64.cpp: builtins/builtins.c
|
||||
@echo Creating C++ source from builtins definition file $<
|
||||
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-64 > $@
|
||||
|
||||
objs/stdlib_ispc.cpp: stdlib.ispc
|
||||
@echo Creating C++ source from $<
|
||||
@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@
|
||||
objs/stdlib_generic_ispc.cpp: stdlib.ispc
|
||||
@echo Creating C++ source from $< for generic
|
||||
@$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
|
||||
python stdlib2cpp.py generic > $@
|
||||
|
||||
objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
objs/stdlib_x86_ispc.cpp: stdlib.ispc
|
||||
@echo Creating C++ source from $< for x86
|
||||
@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
|
||||
python stdlib2cpp.py x86 > $@
|
||||
|
||||
90
README.rst
Normal file
90
README.rst
Normal file
@@ -0,0 +1,90 @@
|
||||
==============================
|
||||
Intel(r) SPMD Program Compiler
|
||||
==============================
|
||||
|
||||
``ispc`` is a compiler for a variant of the C programming language, with
|
||||
extensions for `single program, multiple data
|
||||
<http://en.wikipedia.org/wiki/SPMD>`_ programming. Under the SPMD model,
|
||||
the programmer writes a program that generally appears to be a regular
|
||||
serial program, though the execution model is actually that a number of
|
||||
*program instances* execute in parallel on the hardware.
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
``ispc`` compiles a C-based SPMD programming language to run on the SIMD
|
||||
units of CPUs; it frequently provides a 3x or more speedup on CPUs with
|
||||
4-wide vector SSE units and 5x-6x on CPUs with 8-wide AVX vector units,
|
||||
without any of the difficulty of writing intrinsics code. Parallelization
|
||||
across multiple cores is also supported by ``ispc``, making it
|
||||
possible to write programs that achieve performance improvement that scales
|
||||
by both number of cores and vector unit size.
|
||||
|
||||
There are a few key principles in the design of ``ispc``:
|
||||
|
||||
* To build a small set of extensions to the C language that
|
||||
would deliver excellent performance to performance-oriented
|
||||
programmers who want to run SPMD programs on the CPU.
|
||||
|
||||
* To provide a thin abstraction layer between the programmer
|
||||
and the hardware--in particular, to have an execution and
|
||||
data model where the programmer can cleanly reason about the
|
||||
mapping of their source program to compiled assembly language
|
||||
and the underlying hardware.
|
||||
|
||||
* To make it possible to harness the computational power of SIMD
|
||||
vector units without the extremely low-programmer-productivity
|
||||
activity of directly writing intrinsics.
|
||||
|
||||
* To explore opportunities from close coupling between C/C++
|
||||
application code and SPMD ``ispc`` code running on the
|
||||
same processor--to have lightweight function calls between
|
||||
the two languages and to share data directly via pointers without
|
||||
copying or reformatting.
|
||||
|
||||
``ispc`` is an open source compiler with the BSD license. It uses the
|
||||
remarkable `LLVM Compiler Infrastructure <http://llvm.org>`_ for back-end
|
||||
code generation and optimization and is `hosted on
|
||||
github <http://github.com/ispc/ispc/>`_. It supports Windows, Mac, and
|
||||
Linux, with both x86 and x86-64 targets. It currently supports the SSE2,
|
||||
SSE4, AVX1, and AVX2 instruction sets.
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
``ispc`` provides a number of key features to developers:
|
||||
|
||||
* Familiarity as an extension of the C programming
|
||||
language: ``ispc`` supports familiar C syntax and
|
||||
programming idioms, while adding the ability to write SPMD
|
||||
programs.
|
||||
|
||||
* High-quality SIMD code generation: the performance
|
||||
of code generated by ``ispc`` is often close to that of
|
||||
hand-written intrinsics code.
|
||||
|
||||
* Ease of adoption with existing software
|
||||
systems: functions written in ``ispc`` directly
|
||||
interoperate with application functions written in C/C++ and
|
||||
with application data structures.
|
||||
|
||||
* Portability across over a decade of CPU
|
||||
generations: ``ispc`` has targets for SSE2, SSE4, AVX
|
||||
and AVX2.
|
||||
|
||||
* Portability across operating systems: Microsoft
|
||||
Windows, Mac OS X, and Linux are all supported
|
||||
by ``ispc``.
|
||||
|
||||
* Debugging with standard tools: ``ispc``
|
||||
programs can be debugged with standard debuggers (OS X and
|
||||
Linux only).
|
||||
|
||||
Additional Resources
|
||||
--------------------
|
||||
|
||||
Prebuilt ``ispc`` binaries for Windows, OS X and Linux can be downloaded
|
||||
from the `ispc downloads page <http://ispc.github.com/downloads.html>`_.
|
||||
See also additional
|
||||
`documentation <http://ispc.github.com/documentation.html>`_ and additional
|
||||
`performance information <http://ispc.github.com/perf.html>`_.
|
||||
22
README.txt
22
README.txt
@@ -1,22 +0,0 @@
|
||||
==============================
|
||||
Intel(r) SPMD Program Compiler
|
||||
==============================
|
||||
|
||||
Welcome to the Intel(r) SPMD Program Compiler (ispc)!
|
||||
|
||||
ispc is a new compiler for "single program, multiple data" (SPMD)
|
||||
programs. Under the SPMD model, the programmer writes a program that mostly
|
||||
appears to be a regular serial program, though the execution model is
|
||||
actually that a number of program instances execute in parallel on the
|
||||
hardware. ispc compiles a C-based SPMD programming language to run on the
|
||||
SIMD units of CPUs; it frequently provides a 3x or more speedup on CPUs
|
||||
with 4-wide SSE units, without any of the difficulty of writing intrinsics
|
||||
code.
|
||||
|
||||
ispc is an open source compiler under the BSD license; see the file
|
||||
LICENSE.txt. ispc supports Windows, Mac, and Linux, with both x86 and
|
||||
x86-64 targets. It currently supports the SSE2 and SSE4 instruction sets,
|
||||
though support for AVX should be available soon.
|
||||
|
||||
For more information and examples, as well as a wiki and the bug database,
|
||||
see the ispc distribution site, http://ispc.github.com.
|
||||
483
ast.cpp
Normal file
483
ast.cpp
Normal file
@@ -0,0 +1,483 @@
|
||||
/*
|
||||
Copyright (c) 2011-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file ast.cpp
|
||||
|
||||
@brief General functionality related to abstract syntax trees and
|
||||
traversal of them.
|
||||
*/
|
||||
|
||||
#include "ast.h"
|
||||
#include "expr.h"
|
||||
#include "func.h"
|
||||
#include "stmt.h"
|
||||
#include "sym.h"
|
||||
#include "util.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// ASTNode
|
||||
|
||||
// Out-of-line definition of the ASTNode destructor; it performs no work.
ASTNode::~ASTNode() {}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// AST
|
||||
|
||||
void
|
||||
AST::AddFunction(Symbol *sym, Stmt *code) {
|
||||
if (sym == NULL)
|
||||
return;
|
||||
functions.push_back(new Function(sym, code));
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
AST::GenerateIR() {
|
||||
for (unsigned int i = 0; i < functions.size(); ++i)
|
||||
functions[i]->GenerateIR();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
ASTNode *
|
||||
WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
||||
void *data) {
|
||||
if (node == NULL)
|
||||
return node;
|
||||
|
||||
// Call the callback function
|
||||
if (preFunc != NULL) {
|
||||
if (preFunc(node, data) == false)
|
||||
// The function asked us to not continue recursively, so stop.
|
||||
return node;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Handle Statements
|
||||
if (dynamic_cast<Stmt *>(node) != NULL) {
|
||||
ExprStmt *es;
|
||||
DeclStmt *ds;
|
||||
IfStmt *is;
|
||||
DoStmt *dos;
|
||||
ForStmt *fs;
|
||||
ForeachStmt *fes;
|
||||
ForeachActiveStmt *fas;
|
||||
ForeachUniqueStmt *fus;
|
||||
CaseStmt *cs;
|
||||
DefaultStmt *defs;
|
||||
SwitchStmt *ss;
|
||||
ReturnStmt *rs;
|
||||
LabeledStmt *ls;
|
||||
StmtList *sl;
|
||||
PrintStmt *ps;
|
||||
AssertStmt *as;
|
||||
DeleteStmt *dels;
|
||||
UnmaskedStmt *ums;
|
||||
|
||||
if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
|
||||
es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
|
||||
else if ((ds = dynamic_cast<DeclStmt *>(node)) != NULL) {
|
||||
for (unsigned int i = 0; i < ds->vars.size(); ++i)
|
||||
ds->vars[i].init = (Expr *)WalkAST(ds->vars[i].init, preFunc,
|
||||
postFunc, data);
|
||||
}
|
||||
else if ((is = dynamic_cast<IfStmt *>(node)) != NULL) {
|
||||
is->test = (Expr *)WalkAST(is->test, preFunc, postFunc, data);
|
||||
is->trueStmts = (Stmt *)WalkAST(is->trueStmts, preFunc,
|
||||
postFunc, data);
|
||||
is->falseStmts = (Stmt *)WalkAST(is->falseStmts, preFunc,
|
||||
postFunc, data);
|
||||
}
|
||||
else if ((dos = dynamic_cast<DoStmt *>(node)) != NULL) {
|
||||
dos->testExpr = (Expr *)WalkAST(dos->testExpr, preFunc,
|
||||
postFunc, data);
|
||||
dos->bodyStmts = (Stmt *)WalkAST(dos->bodyStmts, preFunc,
|
||||
postFunc, data);
|
||||
}
|
||||
else if ((fs = dynamic_cast<ForStmt *>(node)) != NULL) {
|
||||
fs->init = (Stmt *)WalkAST(fs->init, preFunc, postFunc, data);
|
||||
fs->test = (Expr *)WalkAST(fs->test, preFunc, postFunc, data);
|
||||
fs->step = (Stmt *)WalkAST(fs->step, preFunc, postFunc, data);
|
||||
fs->stmts = (Stmt *)WalkAST(fs->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((fes = dynamic_cast<ForeachStmt *>(node)) != NULL) {
|
||||
for (unsigned int i = 0; i < fes->startExprs.size(); ++i)
|
||||
fes->startExprs[i] = (Expr *)WalkAST(fes->startExprs[i], preFunc,
|
||||
postFunc, data);
|
||||
for (unsigned int i = 0; i < fes->endExprs.size(); ++i)
|
||||
fes->endExprs[i] = (Expr *)WalkAST(fes->endExprs[i], preFunc,
|
||||
postFunc, data);
|
||||
fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((fas = dynamic_cast<ForeachActiveStmt *>(node)) != NULL) {
|
||||
fas->stmts = (Stmt *)WalkAST(fas->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((fus = dynamic_cast<ForeachUniqueStmt *>(node)) != NULL) {
|
||||
fus->expr = (Expr *)WalkAST(fus->expr, preFunc, postFunc, data);
|
||||
fus->stmts = (Stmt *)WalkAST(fus->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
|
||||
cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
|
||||
else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
|
||||
defs->stmts = (Stmt *)WalkAST(defs->stmts, preFunc, postFunc, data);
|
||||
else if ((ss = dynamic_cast<SwitchStmt *>(node)) != NULL) {
|
||||
ss->expr = (Expr *)WalkAST(ss->expr, preFunc, postFunc, data);
|
||||
ss->stmts = (Stmt *)WalkAST(ss->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if (dynamic_cast<BreakStmt *>(node) != NULL ||
|
||||
dynamic_cast<ContinueStmt *>(node) != NULL ||
|
||||
dynamic_cast<GotoStmt *>(node) != NULL) {
|
||||
// nothing
|
||||
}
|
||||
else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
|
||||
ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
|
||||
else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
|
||||
rs->expr = (Expr *)WalkAST(rs->expr, preFunc, postFunc, data);
|
||||
else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
|
||||
std::vector<Stmt *> &sls = sl->stmts;
|
||||
for (unsigned int i = 0; i < sls.size(); ++i)
|
||||
sls[i] = (Stmt *)WalkAST(sls[i], preFunc, postFunc, data);
|
||||
}
|
||||
else if ((ps = dynamic_cast<PrintStmt *>(node)) != NULL)
|
||||
ps->values = (Expr *)WalkAST(ps->values, preFunc, postFunc, data);
|
||||
else if ((as = dynamic_cast<AssertStmt *>(node)) != NULL)
|
||||
as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
|
||||
else if ((dels = dynamic_cast<DeleteStmt *>(node)) != NULL)
|
||||
dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
|
||||
else if ((ums = dynamic_cast<UnmaskedStmt *>(node)) != NULL)
|
||||
ums->stmts = (Stmt *)WalkAST(ums->stmts, preFunc, postFunc, data);
|
||||
else
|
||||
FATAL("Unhandled statement type in WalkAST()");
|
||||
}
|
||||
else {
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Handle expressions
|
||||
Assert(dynamic_cast<Expr *>(node) != NULL);
|
||||
UnaryExpr *ue;
|
||||
BinaryExpr *be;
|
||||
AssignExpr *ae;
|
||||
SelectExpr *se;
|
||||
ExprList *el;
|
||||
FunctionCallExpr *fce;
|
||||
IndexExpr *ie;
|
||||
MemberExpr *me;
|
||||
TypeCastExpr *tce;
|
||||
ReferenceExpr *re;
|
||||
PtrDerefExpr *ptrderef;
|
||||
RefDerefExpr *refderef;
|
||||
SizeOfExpr *soe;
|
||||
AddressOfExpr *aoe;
|
||||
NewExpr *newe;
|
||||
|
||||
if ((ue = dynamic_cast<UnaryExpr *>(node)) != NULL)
|
||||
ue->expr = (Expr *)WalkAST(ue->expr, preFunc, postFunc, data);
|
||||
else if ((be = dynamic_cast<BinaryExpr *>(node)) != NULL) {
|
||||
be->arg0 = (Expr *)WalkAST(be->arg0, preFunc, postFunc, data);
|
||||
be->arg1 = (Expr *)WalkAST(be->arg1, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((ae = dynamic_cast<AssignExpr *>(node)) != NULL) {
|
||||
ae->lvalue = (Expr *)WalkAST(ae->lvalue, preFunc, postFunc, data);
|
||||
ae->rvalue = (Expr *)WalkAST(ae->rvalue, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((se = dynamic_cast<SelectExpr *>(node)) != NULL) {
|
||||
se->test = (Expr *)WalkAST(se->test, preFunc, postFunc, data);
|
||||
se->expr1 = (Expr *)WalkAST(se->expr1, preFunc, postFunc, data);
|
||||
se->expr2 = (Expr *)WalkAST(se->expr2, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((el = dynamic_cast<ExprList *>(node)) != NULL) {
|
||||
for (unsigned int i = 0; i < el->exprs.size(); ++i)
|
||||
el->exprs[i] = (Expr *)WalkAST(el->exprs[i], preFunc,
|
||||
postFunc, data);
|
||||
}
|
||||
else if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
|
||||
fce->func = (Expr *)WalkAST(fce->func, preFunc, postFunc, data);
|
||||
fce->args = (ExprList *)WalkAST(fce->args, preFunc, postFunc, data);
|
||||
fce->launchCountExpr = (Expr *)WalkAST(fce->launchCountExpr, preFunc,
|
||||
postFunc, data);
|
||||
}
|
||||
else if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL) {
|
||||
ie->baseExpr = (Expr *)WalkAST(ie->baseExpr, preFunc, postFunc, data);
|
||||
ie->index = (Expr *)WalkAST(ie->index, preFunc, postFunc, data);
|
||||
}
|
||||
else if ((me = dynamic_cast<MemberExpr *>(node)) != NULL)
|
||||
me->expr = (Expr *)WalkAST(me->expr, preFunc, postFunc, data);
|
||||
else if ((tce = dynamic_cast<TypeCastExpr *>(node)) != NULL)
|
||||
tce->expr = (Expr *)WalkAST(tce->expr, preFunc, postFunc, data);
|
||||
else if ((re = dynamic_cast<ReferenceExpr *>(node)) != NULL)
|
||||
re->expr = (Expr *)WalkAST(re->expr, preFunc, postFunc, data);
|
||||
else if ((ptrderef = dynamic_cast<PtrDerefExpr *>(node)) != NULL)
|
||||
ptrderef->expr = (Expr *)WalkAST(ptrderef->expr, preFunc, postFunc,
|
||||
data);
|
||||
else if ((refderef = dynamic_cast<RefDerefExpr *>(node)) != NULL)
|
||||
refderef->expr = (Expr *)WalkAST(refderef->expr, preFunc, postFunc,
|
||||
data);
|
||||
else if ((soe = dynamic_cast<SizeOfExpr *>(node)) != NULL)
|
||||
soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
|
||||
else if ((aoe = dynamic_cast<AddressOfExpr *>(node)) != NULL)
|
||||
aoe->expr = (Expr *)WalkAST(aoe->expr, preFunc, postFunc, data);
|
||||
else if ((newe = dynamic_cast<NewExpr *>(node)) != NULL) {
|
||||
newe->countExpr = (Expr *)WalkAST(newe->countExpr, preFunc,
|
||||
postFunc, data);
|
||||
newe->initExpr = (Expr *)WalkAST(newe->initExpr, preFunc,
|
||||
postFunc, data);
|
||||
}
|
||||
else if (dynamic_cast<SymbolExpr *>(node) != NULL ||
|
||||
dynamic_cast<ConstExpr *>(node) != NULL ||
|
||||
dynamic_cast<FunctionSymbolExpr *>(node) != NULL ||
|
||||
dynamic_cast<SyncExpr *>(node) != NULL ||
|
||||
dynamic_cast<NullPointerExpr *>(node) != NULL) {
|
||||
// nothing to do
|
||||
}
|
||||
else
|
||||
FATAL("Unhandled expression type in WalkAST().");
|
||||
}
|
||||
|
||||
// Call the callback function
|
||||
if (postFunc != NULL)
|
||||
return postFunc(node, data);
|
||||
else
|
||||
return node;
|
||||
}
|
||||
|
||||
|
||||
/** Postorder WalkAST() callback that hands back whatever the node's own
    Optimize() method returns.  The data pointer is unused. */
static ASTNode *
lOptimizeNode(ASTNode *node, void *) {
    ASTNode *optimized = node->Optimize();
    return optimized;
}
|
||||
|
||||
|
||||
/** Walk the tree rooted at root with no preorder callback and with
    lOptimizeNode as the postorder callback, and return the result of the
    walk. */
ASTNode *
Optimize(ASTNode *root) {
    ASTPreCallBackFunc noPreCallback = NULL;
    void *noData = NULL;
    return WalkAST(root, noPreCallback, lOptimizeNode, noData);
}
|
||||
|
||||
|
||||
/** Expr * overload: forwards to the ASTNode * version of Optimize() and
    casts the result back to Expr *. */
Expr *
Optimize(Expr *expr) {
    // The explicit cast selects the ASTNode * overload rather than this one.
    ASTNode *result = Optimize((ASTNode *)expr);
    return (Expr *)result;
}
|
||||
|
||||
|
||||
/** Stmt * overload: forwards to the ASTNode * version of Optimize() and
    casts the result back to Stmt *. */
Stmt *
Optimize(Stmt *stmt) {
    // The explicit cast selects the ASTNode * overload rather than this one.
    ASTNode *result = Optimize((ASTNode *)stmt);
    return (Stmt *)result;
}
|
||||
|
||||
|
||||
/** Postorder WalkAST() callback that hands back whatever the node's own
    TypeCheck() method returns.  The data pointer is unused. */
static ASTNode *
lTypeCheckNode(ASTNode *node, void *) {
    ASTNode *checked = node->TypeCheck();
    return checked;
}
|
||||
|
||||
|
||||
/** Walk the tree rooted at root with no preorder callback and with
    lTypeCheckNode as the postorder callback, and return the result of the
    walk. */
ASTNode *
TypeCheck(ASTNode *root) {
    ASTPreCallBackFunc noPreCallback = NULL;
    void *noData = NULL;
    return WalkAST(root, noPreCallback, lTypeCheckNode, noData);
}
|
||||
|
||||
|
||||
/** Expr * overload: forwards to the ASTNode * version of TypeCheck() and
    casts the result back to Expr *. */
Expr *
TypeCheck(Expr *expr) {
    // The explicit cast selects the ASTNode * overload rather than this one.
    ASTNode *result = TypeCheck((ASTNode *)expr);
    return (Expr *)result;
}
|
||||
|
||||
|
||||
/** Stmt * overload: forwards to the ASTNode * version of TypeCheck() and
    casts the result back to Stmt *. */
Stmt *
TypeCheck(Stmt *stmt) {
    // The explicit cast selects the ASTNode * overload rather than this one.
    ASTNode *result = TypeCheck((ASTNode *)stmt);
    return (Stmt *)result;
}
|
||||
|
||||
|
||||
/** State threaded through the cost-estimation walk via the callbacks'
    void * data pointer.  Both fields start at zero. */
struct CostData {
    CostData() : cost(0), foreachDepth(0) { }

    /** Running total of the per-node cost estimates accumulated so far. */
    int cost;
    /** Number of ForeachStmt nodes entered by the preorder callback and
        not yet left by the postorder callback. */
    int foreachDepth;
};
|
||||
|
||||
|
||||
static bool
|
||||
lCostCallbackPre(ASTNode *node, void *d) {
|
||||
CostData *data = (CostData *)d;
|
||||
if (dynamic_cast<ForeachStmt *>(node) != NULL)
|
||||
++data->foreachDepth;
|
||||
if (data->foreachDepth == 0)
|
||||
data->cost += node->EstimateCost();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/** Postorder callback for the cost-estimation walk; d points to a
    CostData.  Leaving a ForeachStmt undoes the depth increment made by the
    preorder callback.  The node is always returned unchanged. */
static ASTNode *
lCostCallbackPost(ASTNode *node, void *d) {
    CostData *costData = (CostData *)d;

    const bool leavingForeach = (dynamic_cast<ForeachStmt *>(node) != NULL);
    if (leavingForeach)
        costData->foreachDepth -= 1;

    return node;
}
|
||||
|
||||
|
||||
int
|
||||
EstimateCost(ASTNode *root) {
|
||||
CostData data;
|
||||
WalkAST(root, lCostCallbackPre, lCostCallbackPost, &data);
|
||||
return data.cost;
|
||||
}
|
||||
|
||||
|
||||
/** Given an AST node, check to see if it's safe if we happen to run the
|
||||
code for that node with the execution mask all off.
|
||||
*/
|
||||
static bool
|
||||
lCheckAllOffSafety(ASTNode *node, void *data) {
|
||||
bool *okPtr = (bool *)data;
|
||||
|
||||
FunctionCallExpr *fce;
|
||||
if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
|
||||
if (fce->func == NULL)
|
||||
return false;
|
||||
|
||||
const Type *type = fce->func->GetType();
|
||||
const PointerType *pt = CastType<PointerType>(type);
|
||||
if (pt != NULL)
|
||||
type = pt->GetBaseType();
|
||||
const FunctionType *ftype = CastType<FunctionType>(type);
|
||||
Assert(ftype != NULL);
|
||||
|
||||
if (ftype->isSafe == false) {
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (dynamic_cast<AssertStmt *>(node) != NULL) {
|
||||
// While it's fine to run the assert for varying tests, it's not
|
||||
// desirable to check an assert on a uniform variable if all of the
|
||||
// lanes are off.
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dynamic_cast<NewExpr *>(node) != NULL ||
|
||||
dynamic_cast<DeleteStmt *>(node) != NULL) {
|
||||
// We definitely don't want to run the uniform variants of these if
|
||||
// the mask is all off. It's also worth skipping the overhead of
|
||||
// executing the varying versions of them in the all-off mask case.
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dynamic_cast<ForeachStmt *>(node) != NULL ||
|
||||
dynamic_cast<ForeachActiveStmt *>(node) != NULL ||
|
||||
dynamic_cast<ForeachUniqueStmt *>(node) != NULL ||
|
||||
dynamic_cast<UnmaskedStmt *>(node) != NULL) {
|
||||
// The various foreach statements also shouldn't be run with an
|
||||
// all-off mask. Since they can re-establish an 'all on' mask,
|
||||
// this would be pretty unintuitive. (More generally, it's
|
||||
// possibly a little strange to allow foreach in the presence of
|
||||
// any non-uniform control flow...)
|
||||
//
|
||||
// Similarly, the implementation of foreach_unique assumes as a
|
||||
// precondition that the mask won't be all off going into it, so
|
||||
// we'll enforce that here...
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
IndexExpr *ie;
|
||||
if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
|
||||
const Type *type = ie->baseExpr->GetType();
|
||||
if (type == NULL)
|
||||
return true;
|
||||
if (CastType<ReferenceType>(type) != NULL)
|
||||
type = type->GetReferenceTarget();
|
||||
|
||||
ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
|
||||
if (ce == NULL) {
|
||||
// indexing with a variable... -> not safe
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
const PointerType *pointerType = CastType<PointerType>(type);
|
||||
if (pointerType != NULL) {
|
||||
// pointer[index] -> can't be sure -> not safe
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
const SequentialType *seqType = CastType<SequentialType>(type);
|
||||
Assert(seqType != NULL);
|
||||
int nElements = seqType->GetElementCount();
|
||||
if (nElements == 0) {
|
||||
// Unsized array, so we can't be sure -> not safe
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
int32_t indices[ISPC_MAX_NVEC];
|
||||
int count = ce->AsInt32(indices);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
if (indices[i] < 0 || indices[i] >= nElements) {
|
||||
// Index is out of bounds -> not safe
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// All indices are in-bounds
|
||||
return true;
|
||||
}
|
||||
|
||||
MemberExpr *me;
|
||||
if ((me = dynamic_cast<MemberExpr *>(node)) != NULL &&
|
||||
me->dereferenceExpr) {
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dynamic_cast<PtrDerefExpr *>(node) != NULL) {
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
SafeToRunWithMaskAllOff(ASTNode *root) {
|
||||
bool safe = true;
|
||||
WalkAST(root, lCheckAllOffSafety, NULL, &safe);
|
||||
return safe;
|
||||
}
|
||||
150
ast.h
Normal file
150
ast.h
Normal file
@@ -0,0 +1,150 @@
|
||||
/*
|
||||
Copyright (c) 2011-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file ast.h
|
||||
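@brief Declarations of the AST node base class, the AST container class,
and the functions for walking, optimizing, and type-checking ASTs.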
|
||||
*/
|
||||
|
||||
#ifndef ISPC_AST_H
|
||||
#define ISPC_AST_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include <vector>
|
||||
|
||||
/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
|
||||
|
||||
This class defines a basic interface that all abstract syntax tree
|
||||
(AST) nodes must implement. The base classes for both expressions
|
||||
(Expr) and statements (Stmt) inherit from this class.
|
||||
*/
|
||||
class ASTNode {
|
||||
public:
|
||||
ASTNode(SourcePos p) : pos(p) { }
|
||||
virtual ~ASTNode();
|
||||
|
||||
/** The Optimize() method should perform any appropriate early-stage
|
||||
optimizations on the node (e.g. constant folding). This method
|
||||
will be called after the node's children have already been
|
||||
optimized, and the caller will store the returned ASTNode * in
|
||||
place of the original node. This method should return NULL if an
|
||||
error is encountered during optimization. */
|
||||
virtual ASTNode *Optimize() = 0;
|
||||
|
||||
/** Type checking should be performed by the node when this method is
|
||||
called. In the event of an error, a NULL value may be returned.
|
||||
As with ASTNode::Optimize(), the caller should store the returned
|
||||
pointer in place of the original ASTNode *. */
|
||||
virtual ASTNode *TypeCheck() = 0;
|
||||
|
||||
/** Estimate the execution cost of the node (not including the cost of
|
||||
the children. The value returned should be based on the COST_*
|
||||
enumerant values defined in ispc.h. */
|
||||
virtual int EstimateCost() const = 0;
|
||||
|
||||
/** All AST nodes must track the file position where they are
|
||||
defined. */
|
||||
SourcePos pos;
|
||||
};
|
||||
|
||||
|
||||
/** Simple representation of the abstract syntax trees for all of the
|
||||
functions declared in a compilation unit.
|
||||
*/
|
||||
class AST {
|
||||
public:
|
||||
/** Add the AST for a function described by the given declaration
|
||||
information and source code. */
|
||||
void AddFunction(Symbol *sym, Stmt *code);
|
||||
|
||||
/** Generate LLVM IR for all of the functions into the current
|
||||
module. */
|
||||
void GenerateIR();
|
||||
|
||||
private:
|
||||
std::vector<Function *> functions;
|
||||
};
|
||||
|
||||
|
||||
/** Callback function type for preorder traversial visiting function for
|
||||
the AST walk.
|
||||
*/
|
||||
typedef bool (* ASTPreCallBackFunc)(ASTNode *node, void *data);
|
||||
|
||||
/** Callback function type for postorder traversial visiting function for
|
||||
the AST walk.
|
||||
*/
|
||||
typedef ASTNode * (* ASTPostCallBackFunc)(ASTNode *node, void *data);
|
||||
|
||||
/** Walk (some portion of) an AST, starting from the given root node. At
|
||||
each node, if preFunc is non-NULL, call it, passing the given void
|
||||
*data pointer; if the call to preFunc function returns false, then the
|
||||
children of the node aren't visited. This function then makes
|
||||
recursive calls to WalkAST() to process the node's children; after
|
||||
doing so, calls postFunc, at the node. The return value from the
|
||||
postFunc call is ignored. */
|
||||
extern ASTNode *WalkAST(ASTNode *root, ASTPreCallBackFunc preFunc,
|
||||
ASTPostCallBackFunc postFunc, void *data);
|
||||
|
||||
/** Perform simple optimizations on the AST or portion thereof passed to
|
||||
this function, returning the resulting AST. */
|
||||
extern ASTNode *Optimize(ASTNode *root);
|
||||
|
||||
/** Convenience version of Optimize() for Expr *s that returns an Expr *
|
||||
(rather than an ASTNode *, which would require the caller to cast back
|
||||
to an Expr *). */
|
||||
extern Expr *Optimize(Expr *);
|
||||
|
||||
/** Convenience version of Optimize() for Expr *s that returns an Stmt *
|
||||
(rather than an ASTNode *, which would require the caller to cast back
|
||||
to a Stmt *). */
|
||||
extern Stmt *Optimize(Stmt *);
|
||||
|
||||
/** Perform type-checking on the given AST (or portion of one), returning a
|
||||
pointer to the root of the resulting AST. */
|
||||
extern ASTNode *TypeCheck(ASTNode *root);
|
||||
|
||||
/** Convenience version of TypeCheck() for Expr *s that returns an Expr *. */
|
||||
extern Expr *TypeCheck(Expr *);
|
||||
|
||||
/** Convenience version of TypeCheck() for Stmt *s that returns an Stmt *. */
|
||||
extern Stmt *TypeCheck(Stmt *);
|
||||
|
||||
/** Returns an estimate of the execution cost of the tree starting at
|
||||
the given root. */
|
||||
extern int EstimateCost(ASTNode *root);
|
||||
|
||||
/** Returns true if it would be safe to run the given code with an "all
|
||||
off" mask. */
|
||||
extern bool SafeToRunWithMaskAllOff(ASTNode *root);
|
||||
|
||||
#endif // ISPC_AST_H
|
||||
@@ -4,30 +4,43 @@ import sys
|
||||
import string
|
||||
import re
|
||||
import subprocess
|
||||
import platform
|
||||
import os
|
||||
|
||||
length=0
|
||||
|
||||
src=str(sys.argv[1])
|
||||
|
||||
target = re.sub(".*builtins-", "", src)
|
||||
target = re.sub("builtins/target-", "", src)
|
||||
target = re.sub(r"builtins\\target-", "", target)
|
||||
target = re.sub("builtins/", "", target)
|
||||
target = re.sub(r"builtins\\", "", target)
|
||||
target = re.sub("\.ll$", "", target)
|
||||
target = re.sub("\.c$", "", target)
|
||||
target = re.sub("-", "_", target)
|
||||
|
||||
llvm_as="llvm-as"
|
||||
if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT") != -1:
|
||||
llvm_as = os.getenv("LLVM_INSTALL_DIR").replace("\\", "/") + "/bin/" + llvm_as
|
||||
|
||||
try:
|
||||
as_out=subprocess.Popen([ "llvm-as", "-", "-o", "-"], stdout=subprocess.PIPE)
|
||||
as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
|
||||
except IOError:
|
||||
print >> sys.stderr, "Couldn't open " + src
|
||||
sys.stderr.write("Couldn't open " + src)
|
||||
sys.exit(1)
|
||||
|
||||
print "unsigned char builtins_bitcode_" + target + "[] = {"
|
||||
for line in as_out.stdout.readlines():
|
||||
length = length + len(line)
|
||||
for c in line:
|
||||
print ord(c)
|
||||
print ", "
|
||||
print " 0 };\n\n"
|
||||
print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n"
|
||||
width = 16;
|
||||
sys.stdout.write("unsigned char builtins_bitcode_" + target + "[] = {\n")
|
||||
|
||||
data = as_out.stdout.read()
|
||||
for i in range(0, len(data), 1):
|
||||
sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))
|
||||
|
||||
if i%width == (width-1):
|
||||
sys.stdout.write("\n")
|
||||
|
||||
sys.stdout.write("0x00 };\n\n")
|
||||
sys.stdout.write("int builtins_bitcode_" + target + "_length = " + str(i+1) + ";\n")
|
||||
|
||||
as_out.wait()
|
||||
|
||||
|
||||
15
buildall.bat
Normal file
15
buildall.bat
Normal file
@@ -0,0 +1,15 @@
|
||||
@echo off
|
||||
|
||||
REM If LLVM_INSTALL_DIR isn't set globally in your environment,
|
||||
REM it can be set here.
|
||||
set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
|
||||
|
||||
REM Both the LLVM binaries and python need to be in the path
|
||||
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
|
||||
|
||||
msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
|
||||
|
||||
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Release /t:rebuild
|
||||
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Debug /t:rebuild
|
||||
msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild
|
||||
msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Debug /t:rebuild
|
||||
11
buildispc.bat
Normal file
11
buildispc.bat
Normal file
@@ -0,0 +1,11 @@
|
||||
@echo off
|
||||
|
||||
REM If LLVM_INSTALL_DIR isn't set globally in your environment,
|
||||
REM it can be set here.
|
||||
set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
|
||||
set LLVM_VERSION=3.1svn
|
||||
|
||||
REM Both the LLVM binaries and python need to be in the path
|
||||
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
|
||||
|
||||
msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
|
||||
417
builtins-sse.ll
417
builtins-sse.ll
@@ -1,417 +0,0 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;; This file declares implementations of various stdlib builtins that
|
||||
;; only require SSE version 1 and 2 functionality; this file, in turn
|
||||
;; is then included by builtins-sse2.ll and builtins-sse4.ll to provide
|
||||
;; those definitions for them.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
int64minmax(4)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
;; Reciprocal of each lane: take the hardware rcpps estimate and refine it
;; with one Newton-Raphson step, i.e. est * (2 - v * est).
define internal <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; initial estimate from the SSE intrinsic
  %est = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
  ; v * est
  %prod = fmul <4 x float> %0, %est
  ; 2 - v * est
  %corr = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %prod
  ; est * (2 - v * est)
  %refined = fmul <4 x float> %est, %corr
  ret <4 x float> %refined
}
|
||||
|
||||
;; Scalar reciprocal: rcpss estimate on lane 0 of a temporary vector,
;; followed by one Newton-Raphson step, est * (2 - v * est).
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
  ; the intrinsic operates on vectors, so place the scalar in lane 0
  %in_vec = insertelement <4 x float> undef, float %0, i32 0
  %est_vec = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %in_vec)
  %est = extractelement <4 x float> %est_vec, i32 0

  ; refinement step, same formula as the varying version
  %prod = fmul float %0, %est
  %corr = fsub float 2., %prod
  %refined = fmul float %est, %corr
  ret float %refined
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
;; Reciprocal square root of each lane: rsqrtps estimate refined with one
;; Newton-Raphson step, 0.5 * est * (3 - (v * est) * est).
define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
  ; initial estimate from the SSE intrinsic
  %est = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
  ; (v * est) * est
  %v_est = fmul <4 x float> %v, %est
  %v_est_est = fmul <4 x float> %v_est, %est
  ; 3 - (v * est) * est
  %resid = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_est_est
  ; est * (...) and the final scale by 0.5
  %scaled = fmul <4 x float> %est, %resid
  %refined = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %scaled
  ret <4 x float> %refined
}
|
||||
|
||||
;; Scalar reciprocal square root: rsqrtss estimate on lane 0 of a temporary
;; vector, refined with one Newton-Raphson step,
;; 0.5 * est * (3 - (v * est) * est).
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  ; the intrinsic operates on vectors, so place the scalar in lane 0
  %in_vec = insertelement <4 x float> undef, float %0, i32 0
  %est_vec = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %in_vec)
  %est = extractelement <4 x float> %est_vec, i32 0

  ; refinement step, same formula as the varying version
  %v_est = fmul float %0, %est
  %v_est_est = fmul float %v_est, %est
  %resid = fsub float 3., %v_est_est
  %scaled = fmul float %est, %resid
  %refined = fmul float 0.5, %scaled
  ret float %refined
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fast math mode
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
;; Read MXCSR, OR in the DAZ and FTZ bits, and write it back.
define internal void @__fastmath() nounwind alwaysinline {
  ; stmxcsr/ldmxcsr go through memory, so use a stack slot as scratch space
  %csr = alloca i32
  %csr_i8 = bitcast i32 * %csr to i8 *
  call void @llvm.x86.sse.stmxcsr(i8 * %csr_i8)
  %current = load i32 *%csr

  ; DAZ (64) | FTZ (32768) = 32832
  %with_flags = or i32 %current, 32832
  store i32 %with_flags, i32 *%csr
  call void @llvm.x86.sse.ldmxcsr(i8 * %csr_i8)
  ret void
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
;; Declarations of the 4-wide single-precision SVML routines that the
;; __svml_* wrappers in this file call.
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
; Deliberately not readnone: this routine receives a pointer to a result
; vector, so it must be allowed to write memory.  With readnone the optimizer
; could assume the pointed-to vector is never modified by the call.
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
|
||||
define internal <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
;; Combined sine/cosine wrapper.  %0 is the input vector; the value returned
;; by __svml_sincosf4 is stored through %1, and %2 is handed to
;; __svml_sincosf4 as its pointer argument (presumably the location for the
;; other result -- confirm against the SVML documentation).
;;
;; This function writes memory through its pointer arguments, so it must not
;; carry the readnone attribute: with readnone the optimizer is entitled to
;; treat the call as having no effect on memory and drop or reorder it.
define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind alwaysinline {
  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
  store <4 x float> %s, <4 x float> * %1
  ret void
}
|
||||
|
||||
define internal <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define internal <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
|
||||
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
|
||||
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
|
||||
define internal double @__min_uniform_double(double, double) nounwind readnone {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
|
||||
define internal double @__max_uniform_double(double, double) nounwind readnone {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
;; Return the i32 produced by movmskps for the given 4-wide integer mask.
define internal i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
  ; the intrinsic takes a float vector, so reinterpret the mask bits
  %mask_as_float = bitcast <4 x i32> %0 to <4 x float>
  %bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %mask_as_float) nounwind readnone
  ret i32 %bits
}
|
||||
|
||||
define internal float @__reduce_min_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
define internal float @__reduce_max_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
;; Sum of the four i32 lanes of %v.
define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
  ; bring elements 2 and 3 down to lanes 0 and 1; the upper lanes are unused
  %upper = shufflevector <4 x i32> %v, <4 x i32> undef,
                         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; lane 0 = v[2] + v[0], lane 1 = v[3] + v[1]
  %pairs = add <4 x i32> %upper, %v
  %pair0 = extractelement <4 x i32> %pairs, i32 0
  %pair1 = extractelement <4 x i32> %pairs, i32 1
  %total = add i32 %pair0, %pair1
  ret i32 %total
}
|
||||
|
||||
define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
|
||||
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
;; Sum of the four double lanes of the argument.
define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
  ; split into the low pair (elements 0,1) and the high pair (elements 2,3)
  %lo = shufflevector <4 x double> %0, <4 x double> undef,
                      <2 x i32> <i32 0, i32 1>
  %hi = shufflevector <4 x double> %0, <4 x double> undef,
                      <2 x i32> <i32 2, i32 3>
  ; lane 0 = v[0] + v[2], lane 1 = v[1] + v[3]
  %pairs = fadd <2 x double> %lo, %hi
  %pair0 = extractelement <2 x double> %pairs, i32 0
  %pair1 = extractelement <2 x double> %pairs, i32 1
  %total = fadd double %pair0, %pair1
  ret double %total
}
|
||||
|
||||
define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
;; Sum of the four i64 lanes of the argument.
define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
  ; split into the low pair (elements 0,1) and the high pair (elements 2,3)
  %lo = shufflevector <4 x i64> %0, <4 x i64> undef,
                      <2 x i32> <i32 0, i32 1>
  %hi = shufflevector <4 x i64> %0, <4 x i64> undef,
                      <2 x i32> <i32 2, i32 3>
  ; lane 0 = v[0] + v[2], lane 1 = v[1] + v[3]
  %pairs = add <2 x i64> %lo, %hi
  %pair0 = extractelement <2 x i64> %pairs, i32 0
  %pair1 = extractelement <2 x i64> %pairs, i32 1
  %total = add i64 %pair0, %pair1
  ret i64 %total
}
|
||||
|
||||
define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
reduce_equal(4)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
masked_store_blend_8_16_by_4()
|
||||
|
||||
gen_masked_store(4, i8, 8)
|
||||
gen_masked_store(4, i16, 16)
|
||||
gen_masked_store(4, i32, 32)
|
||||
gen_masked_store(4, i64, 64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(4, i8, 8)
|
||||
load_and_broadcast(4, i16, 16)
|
||||
load_and_broadcast(4, i32, 32)
|
||||
load_and_broadcast(4, i64, 64)
|
||||
|
||||
load_masked(4, i8, 8, 1)
|
||||
load_masked(4, i16, 16, 2)
|
||||
load_masked(4, i32, 32, 4)
|
||||
load_masked(4, i64, 64, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather(4, i8)
|
||||
gen_gather(4, i16)
|
||||
gen_gather(4, i32)
|
||||
gen_gather(4, i64)
|
||||
|
||||
gen_scatter(4, i8)
|
||||
gen_scatter(4, i16)
|
||||
gen_scatter(4, i32)
|
||||
gen_scatter(4, i64)
|
||||
357
builtins-sse2.ll
357
builtins-sse2.ll
@@ -1,357 +0,0 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Define the standard library builtins for the SSE2 target
|
||||
|
||||
; Define some basics for a 4-wide target
|
||||
stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
scans(4)
|
||||
|
||||
; Include the various definitions of things that only require SSE1 and SSE2
|
||||
include(`builtins-sse.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
;; There are not any rounding instructions in SSE2, so we have to emulate
|
||||
;; the functionality with multiple instructions...
|
||||
|
||||
; The code for __round_* is the result of compiling the following source
|
||||
; code.
|
||||
;
|
||||
; export float Round(float x) {
|
||||
; unsigned int sign = signbits(x);
|
||||
; unsigned int ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; x += 0x1.0p23f;
|
||||
; x -= 0x1.0p23f;
|
||||
; ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; return x;
|
||||
;}
|
||||
|
||||
;; Vector form of the Round() routine quoted in the comment preceding this
;; function: clear the sign bit, add and then subtract 2^23 (8388608.0),
;; and put the original sign bit back.
define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %bits = bitcast <4 x float> %0 to <4 x i32>
  ; isolate the sign bit of every lane
  %sign = and <4 x i32> %bits, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ; xor with the sign bit clears it
  %abs_bits = xor <4 x i32> %bits, %sign
  %abs = bitcast <4 x i32> %abs_bits to <4 x float>
  ; x += 0x1.0p23f; x -= 0x1.0p23f;
  %biased = fadd <4 x float> %abs, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
  %rounded = fadd <4 x float> %biased, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
  ; restore the original sign
  %rounded_bits = bitcast <4 x float> %rounded to <4 x i32>
  %signed_bits = xor <4 x i32> %rounded_bits, %sign
  %result = bitcast <4 x i32> %signed_bits to <4 x float>
  ret <4 x float> %result
}
|
||||
|
||||
;; Scalar form of the Round() routine quoted in the comment ahead of the
;; __round_* functions: clear the sign bit, add and then subtract 2^23
;; (8388608.0), and put the original sign bit back.
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
  %bits = bitcast float %0 to i32
  ; isolate the sign bit
  %sign = and i32 %bits, -2147483648
  ; xor with the sign bit clears it
  %abs_bits = xor i32 %sign, %bits
  %abs = bitcast i32 %abs_bits to float
  ; x += 0x1.0p23f; x -= 0x1.0p23f;
  %biased = fadd float %abs, 8.388608e+06
  %rounded = fadd float %biased, -8.388608e+06
  ; restore the original sign
  %rounded_bits = bitcast float %rounded to i32
  %signed_bits = xor i32 %rounded_bits, %sign
  %result = bitcast i32 %signed_bits to float
  ret float %result
}
|
||||
|
||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||
;; bitcode from compiling the following source code...
|
||||
|
||||
;export float Floor(float x) {
|
||||
; float y = Round(x);
|
||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
||||
; float delta = -1.f;
|
||||
; unsigned int idelta = intbits(delta);
|
||||
; idelta &= cmp;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
|
||||
%bincmp.i = fcmp ogt <4 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
|
||||
%bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
|
||||
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
|
||||
%binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret <4 x float> %binop.i
|
||||
}
|
||||
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
|
||||
%bincmp.i = fcmp ogt float %calltmp.i, %0
|
||||
%selectexpr.i = sext i1 %bincmp.i to i32
|
||||
%bitop.i = and i32 %selectexpr.i, -1082130432
|
||||
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
|
||||
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret float %binop.i
|
||||
}
|
||||
|
||||
;; And here is the code we compiled to get the __ceil* functions below
|
||||
;
|
||||
;export uniform float Ceil(uniform float x) {
|
||||
; uniform float y = Round(x);
|
||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
||||
; uniform float delta = 1.f;
|
||||
; uniform int idelta = intbits(delta);
|
||||
; idelta &= yltx;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
|
||||
%bincmp.i = fcmp olt <4 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
|
||||
%bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
|
||||
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
|
||||
%binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret <4 x float> %binop.i
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
|
||||
%bincmp.i = fcmp olt float %calltmp.i, %0
|
||||
%selectexpr.i = sext i1 %bincmp.i to i32
|
||||
%bitop.i = and i32 %selectexpr.i, 1065353216
|
||||
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
|
||||
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret float %binop.i
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare double @round(double)
|
||||
declare double @floor(double)
|
||||
declare double @ceil(double)
|
||||
|
||||
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @round)
|
||||
}
|
||||
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%r = call double @round(double %0)
|
||||
ret double %r
|
||||
}
|
||||
|
||||
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @floor)
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%r = call double @floor(double %0)
|
||||
ret double %r
|
||||
}
|
||||
|
||||
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @ceil)
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%r = call double @ceil(double %0)
|
||||
ret double %r
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; min/max
|
||||
|
||||
; There is no blend instruction with SSE2, so we simulate it with bit
|
||||
; operations on i32s. For these two vselect functions, for each
|
||||
; vector element, if the mask is on, we return the corresponding value
|
||||
; from %1, and otherwise return the value from %0.
|
||||
|
||||
;; Bitwise blend of two <4 x i32> vectors: (%0 & ~%mask) | (%1 & %mask).
;; Bits set in %mask select the second operand, clear bits select the first.
define internal <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
                                         <4 x i32> %mask) nounwind readnone alwaysinline {
  ; bits taken from the second operand
  %from_new = and <4 x i32> %1, %mask
  ; bits kept from the first operand (xor with all-ones inverts the mask)
  %inv_mask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %from_old = and <4 x i32> %0, %inv_mask
  %blended = or <4 x i32> %from_old, %from_new
  ret <4 x i32> %blended
}
|
||||
|
||||
;; Float flavour of __vselect_i32: reinterpret both operands as <4 x i32>,
;; blend them with __vselect_i32 under %mask, and reinterpret the result
;; as <4 x float>.
define internal <4 x float> @__vselect_float(<4 x float>, <4 x float>,
                                             <4 x i32> %mask) nounwind readnone alwaysinline {
  %old_bits = bitcast <4 x float> %0 to <4 x i32>
  %new_bits = bitcast <4 x float> %1 to <4 x i32>
  %blend_bits = call <4 x i32> @__vselect_i32(<4 x i32> %old_bits, <4 x i32> %new_bits, <4 x i32> %mask)
  %blend = bitcast <4 x i32> %blend_bits to <4 x float>
  ret <4 x float> %blend
}
|
||||
|
||||
|
||||
; To do vector integer min and max, we do the vector compare and then sign
|
||||
; extend the i1 vector result to an i32 mask. The __vselect does the
|
||||
; rest...
|
||||
|
||||
define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp slt <4 x i32> %0, %1
|
||||
%mask = sext <4 x i1> %c to <4 x i32>
|
||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp slt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp sgt <4 x i32> %0, %1
|
||||
%mask = sext <4 x i1> %c to <4 x i32>
|
||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp sgt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; The functions for unsigned ints are similar, just with unsigned
|
||||
; comparison functions...
|
||||
|
||||
define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp ult <4 x i32> %0, %1
|
||||
%mask = sext <4 x i1> %c to <4 x i32>
|
||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp ult i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp ugt <4 x i32> %0, %1
|
||||
%mask = sext <4 x i1> %c to <4 x i32>
|
||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp ugt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32)
|
||||
declare i64 @llvm.ctpop.i64(i64)
|
||||
|
||||
;; Population count of a 32-bit value via the llvm.ctpop.i32 intrinsic.
;; Marked readnone, matching __popcnt_int64: the body performs no memory
;; access, so the weaker readonly attribute was needlessly conservative.
define internal i32 @__popcnt_int32(i32) nounwind readnone alwaysinline {
  %count = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %count
}
|
||||
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
|
||||
%val = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %val
|
||||
}
|
||||
|
||||
|
||||
;; Sum of the four float lanes of %v.
define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
  ; bring elements 2 and 3 down to lanes 0 and 1; the upper lanes are unused
  %upper = shufflevector <4 x float> %v, <4 x float> undef,
                         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; lane 0 = v[2] + v[0], lane 1 = v[3] + v[1]
  %pairs = fadd <4 x float> %upper, %v
  %pair0 = extractelement <4 x float> %pairs, i32 0
  %pair1 = extractelement <4 x float> %pairs, i32 1
  %total = fadd float %pair0, %pair1
  ret float %total
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
;; Masked store implemented as load / blend / store: the vector currently at
;; %0 is loaded, blended with %1 under %mask via __vselect_i32 (mask bits set
;; take the value from %1), and the result is written back to %0.
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
                                     <4 x i32> %mask) nounwind alwaysinline {
  %stored = load <4 x i32> * %0, align 4
  %blended = call <4 x i32> @__vselect_i32(<4 x i32> %stored, <4 x i32> %1, <4 x i32> %mask)
  store <4 x i32> %blended, <4 x i32> * %0, align 4
  ret void
}
|
||||
|
||||
;; Masked store of four 64-bit elements implemented as load / blend / store.
;; The blend is done as two 128-bit halves: each <2 x i64> half is bitcast to
;; <4 x float> and blended with __vselect_float, using a mask in which every
;; 32-bit lane of %mask is duplicated so that it covers both 32-bit halves of
;; its 64-bit element.
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                     <4 x i32> %mask) nounwind alwaysinline {
  %stored = load <4 x i64>* %ptr, align 8

  ; doubled-up masks: lanes 0,0,1,1 for elements 0-1 and 2,2,3,3 for elements 2-3
  %mask_lo = shufflevector <4 x i32> %mask, <4 x i32> undef,
                           <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  %mask_hi = shufflevector <4 x i32> %mask, <4 x i32> undef,
                           <4 x i32> <i32 2, i32 2, i32 3, i32 3>

  ; elements 0 and 1: extract old and new values and blend them
  %stored_lo = shufflevector <4 x i64> %stored, <4 x i64> undef,
                             <2 x i32> <i32 0, i32 1>
  %stored_lo_f = bitcast <2 x i64> %stored_lo to <4 x float>
  %new_lo = shufflevector <4 x i64> %new, <4 x i64> undef,
                          <2 x i32> <i32 0, i32 1>
  %new_lo_f = bitcast <2 x i64> %new_lo to <4 x float>
  %blend_lo_f = call <4 x float> @__vselect_float(<4 x float> %stored_lo_f, <4 x float> %new_lo_f, <4 x i32> %mask_lo)
  %blend_lo = bitcast <4 x float> %blend_lo_f to <2 x i64>

  ; elements 2 and 3: same again
  %stored_hi = shufflevector <4 x i64> %stored, <4 x i64> undef,
                             <2 x i32> <i32 2, i32 3>
  %stored_hi_f = bitcast <2 x i64> %stored_hi to <4 x float>
  %new_hi = shufflevector <4 x i64> %new, <4 x i64> undef,
                          <2 x i32> <i32 2, i32 3>
  %new_hi_f = bitcast <2 x i64> %new_hi to <4 x float>
  %blend_hi_f = call <4 x float> @__vselect_float(<4 x float> %stored_hi_f, <4 x float> %new_hi_f, <4 x i32> %mask_hi)
  %blend_hi = bitcast <4 x float> %blend_hi_f to <2 x i64>

  ; concatenate the two halves back into a <4 x i64> and write it out
  %merged = shufflevector <2 x i64> %blend_lo, <2 x i64> %blend_hi,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %merged, <4 x i64> * %ptr, align 8
  ret void
}
|
||||
|
||||
300
builtins-sse4.ll
300
builtins-sse4.ll
@@ -1,300 +0,0 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; Define common 4-wide stuff
|
||||
stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
scans(4)
|
||||
|
||||
; Define the stuff that can be done with base SSE1/SSE2 instructions
|
||||
include(`builtins-sse.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
; the roundss intrinsic is a total mess--docs say:
|
||||
;
|
||||
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
||||
;
|
||||
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
||||
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
||||
; return value is described by the following equations:
|
||||
;
|
||||
; r0 = RND(b0)
|
||||
; r1 = a1
|
||||
; r2 = a2
|
||||
; r3 = a3
|
||||
;
|
||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||
; here. So we pass the same register for both. Further, only the 0th
|
||||
; element of the b parameter matters
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||
|
||||
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
round2to4double(%0, 8)
|
||||
}
|
||||
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round2to4double(%0, 9)
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round2to4double(%0, 10)
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int32 min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
|
||||
ret <4 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
|
||||
ret <4 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; unsigned int min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
|
||||
ret <4 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
|
||||
ret <4 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
|
||||
%v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
|
||||
%scalar = extractelement <4 x float> %v2, i32 0
|
||||
ret float %scalar
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
|
||||
<4 x float>) nounwind readnone
|
||||
|
||||
|
||||
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
|
||||
%oldValue = load <4 x i32>* %0, align 4
|
||||
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
|
||||
%newAsFloat = bitcast <4 x i32> %1 to <4 x float>
|
||||
%blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
|
||||
<4 x float> %newAsFloat,
|
||||
<4 x float> %mask_as_float)
|
||||
%blendAsInt = bitcast <4 x float> %blend to <4 x i32>
|
||||
store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
<4 x i32> %i32mask) nounwind alwaysinline {
|
||||
%oldValue = load <4 x i64>* %ptr, align 8
|
||||
%mask = bitcast <4 x i32> %i32mask to <4 x float>
|
||||
|
||||
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
|
||||
; are actually bitcast <2 x i64> values
|
||||
;
|
||||
; set up the first two 64-bit values
|
||||
%old01 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
|
||||
<2 x i32> <i32 0, i32 1>
|
||||
%old01f = bitcast <2 x i64> %old01 to <4 x float>
|
||||
%new01 = shufflevector <4 x i64> %new, <4 x i64> undef,
|
||||
<2 x i32> <i32 0, i32 1>
|
||||
%new01f = bitcast <2 x i64> %new01 to <4 x float>
|
||||
; compute mask--note that the indices 0 and 1 are doubled-up
|
||||
%mask01 = shufflevector <4 x float> %mask, <4 x float> undef,
|
||||
<4 x i32> <i32 0, i32 0, i32 1, i32 1>
|
||||
; and blend the two values
|
||||
%result01f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old01f,
|
||||
<4 x float> %new01f,
|
||||
<4 x float> %mask01)
|
||||
%result01 = bitcast <4 x float> %result01f to <2 x i64>
|
||||
|
||||
; and again
|
||||
%old23 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
|
||||
<2 x i32> <i32 2, i32 3>
|
||||
%old23f = bitcast <2 x i64> %old23 to <4 x float>
|
||||
%new23 = shufflevector <4 x i64> %new, <4 x i64> undef,
|
||||
<2 x i32> <i32 2, i32 3>
|
||||
%new23f = bitcast <2 x i64> %new23 to <4 x float>
|
||||
; compute mask--note that the values 2 and 3 are doubled-up
|
||||
%mask23 = shufflevector <4 x float> %mask, <4 x float> undef,
|
||||
<4 x i32> <i32 2, i32 2, i32 3, i32 3>
|
||||
; and blend the two values
|
||||
%result23f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old23f,
|
||||
<4 x float> %new23f,
|
||||
<4 x float> %mask23)
|
||||
%result23 = bitcast <4 x float> %result23f to <2 x i64>
|
||||
|
||||
; reconstruct the final <4 x i64> vector
|
||||
%final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
store <4 x i64> %final, <4 x i64> * %ptr, align 8
|
||||
ret void
|
||||
}
|
||||
690
builtins.cpp
690
builtins.cpp
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -99,6 +99,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
|
||||
|
||||
// varying
|
||||
if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType &&
|
||||
t == LLVMTypes::MaskType)
|
||||
return AtomicType::VaryingBool;
|
||||
else if (t == LLVMTypes::Int8VectorType)
|
||||
return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
|
||||
else if (t == LLVMTypes::Int16VectorType)
|
||||
@@ -114,59 +117,39 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
|
||||
// pointers to uniform
|
||||
else if (t == LLVMTypes::Int8PointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt8 :
|
||||
AtomicType::UniformInt8, false);
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt8 :
|
||||
AtomicType::UniformInt8);
|
||||
else if (t == LLVMTypes::Int16PointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt16 :
|
||||
AtomicType::UniformInt16, false);
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt16 :
|
||||
AtomicType::UniformInt16);
|
||||
else if (t == LLVMTypes::Int32PointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 :
|
||||
AtomicType::UniformInt32, false);
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt32 :
|
||||
AtomicType::UniformInt32);
|
||||
else if (t == LLVMTypes::Int64PointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt64 :
|
||||
AtomicType::UniformInt64, false);
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt64 :
|
||||
AtomicType::UniformInt64);
|
||||
else if (t == LLVMTypes::FloatPointerType)
|
||||
return new ReferenceType(AtomicType::UniformFloat, false);
|
||||
return PointerType::GetUniform(AtomicType::UniformFloat);
|
||||
else if (t == LLVMTypes::DoublePointerType)
|
||||
return new ReferenceType(AtomicType::UniformDouble, false);
|
||||
return PointerType::GetUniform(AtomicType::UniformDouble);
|
||||
|
||||
// pointers to varying
|
||||
else if (t == LLVMTypes::Int8VectorPointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt8 :
|
||||
AtomicType::VaryingInt8, false);
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt8 :
|
||||
AtomicType::VaryingInt8);
|
||||
else if (t == LLVMTypes::Int16VectorPointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt16 :
|
||||
AtomicType::VaryingInt16, false);
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt16 :
|
||||
AtomicType::VaryingInt16);
|
||||
else if (t == LLVMTypes::Int32VectorPointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
|
||||
AtomicType::VaryingInt32, false);
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt32 :
|
||||
AtomicType::VaryingInt32);
|
||||
else if (t == LLVMTypes::Int64VectorPointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt64 :
|
||||
AtomicType::VaryingInt64, false);
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt64 :
|
||||
AtomicType::VaryingInt64);
|
||||
else if (t == LLVMTypes::FloatVectorPointerType)
|
||||
return new ReferenceType(AtomicType::VaryingFloat, false);
|
||||
return PointerType::GetUniform(AtomicType::VaryingFloat);
|
||||
else if (t == LLVMTypes::DoubleVectorPointerType)
|
||||
return new ReferenceType(AtomicType::VaryingDouble, false);
|
||||
|
||||
// arrays
|
||||
else if (llvm::isa<const llvm::PointerType>(t)) {
|
||||
const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);
|
||||
|
||||
// Is it a pointer to an unsized array of objects? If so, then
|
||||
// create the equivalent ispc type. Note that it has to be a
|
||||
// reference to an array, since ispc passes arrays to functions by
|
||||
// reference.
|
||||
const llvm::ArrayType *at =
|
||||
llvm::dyn_cast<const llvm::ArrayType>(pt->getElementType());
|
||||
if (at != NULL) {
|
||||
const Type *eltType = lLLVMTypeToISPCType(at->getElementType(),
|
||||
intAsUnsigned);
|
||||
if (eltType == NULL)
|
||||
return NULL;
|
||||
return new ReferenceType(new ArrayType(eltType, at->getNumElements()),
|
||||
false);
|
||||
}
|
||||
}
|
||||
return PointerType::GetUniform(AtomicType::VaryingDouble);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
@@ -174,18 +157,16 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
|
||||
static void
|
||||
lCreateSymbol(const std::string &name, const Type *returnType,
|
||||
const std::vector<const Type *> &argTypes,
|
||||
llvm::SmallVector<const Type *, 8> &argTypes,
|
||||
const llvm::FunctionType *ftype, llvm::Function *func,
|
||||
SymbolTable *symbolTable) {
|
||||
SourcePos noPos;
|
||||
noPos.name = "__stdlib";
|
||||
|
||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||
// set NULL default arguments
|
||||
std::vector<ConstExpr *> defaults;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j)
|
||||
defaults.push_back(NULL);
|
||||
funcType->SetArgumentDefaults(defaults);
|
||||
|
||||
Debug(noPos, "Created builtin symbol \"%s\" [%s]\n", name.c_str(),
|
||||
funcType->GetString().c_str());
|
||||
|
||||
Symbol *sym = new Symbol(name, noPos, funcType);
|
||||
sym->function = func;
|
||||
@@ -208,20 +189,20 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
if (name.size() < 3 || name[0] != '_' || name[1] != '_')
|
||||
return false;
|
||||
|
||||
Debug(SourcePos(), "Attempting to create ispc symbol for function \"%s\".",
|
||||
name.c_str());
|
||||
|
||||
// An unfortunate hack: we want this builtin function to have the
|
||||
// signature "int __sext_varying_bool(bool)", but the ispc function
|
||||
// symbol creation code below assumes that any LLVM vector of i32s is a
|
||||
// varying int32. Here, we need that to be interpreted as a varying
|
||||
// bool, so just have a one-off override for that one...
|
||||
if (name == "__sext_varying_bool") {
|
||||
if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") {
|
||||
const Type *returnType = AtomicType::VaryingInt32;
|
||||
std::vector<const Type *> argTypes;
|
||||
llvm::SmallVector<const Type *, 8> argTypes;
|
||||
argTypes.push_back(AtomicType::VaryingBool);
|
||||
std::vector<ConstExpr *> defaults;
|
||||
defaults.push_back(NULL);
|
||||
|
||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||
funcType->SetArgumentDefaults(defaults);
|
||||
|
||||
Symbol *sym = new Symbol(name, noPos, funcType);
|
||||
sym->function = func;
|
||||
@@ -238,22 +219,27 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
|
||||
const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType(),
|
||||
intAsUnsigned);
|
||||
if (!returnType)
|
||||
if (returnType == NULL) {
|
||||
Debug(SourcePos(), "Failed: return type not representable for "
|
||||
"builtin %s.", name.c_str());
|
||||
// return type not representable in ispc -> not callable from ispc
|
||||
return false;
|
||||
}
|
||||
|
||||
// Iterate over the arguments and try to find their equivalent ispc
|
||||
// types. Track if any of the arguments has an integer type.
|
||||
bool anyIntArgs = false, anyReferenceArgs = false;
|
||||
std::vector<const Type *> argTypes;
|
||||
bool anyIntArgs = false;
|
||||
llvm::SmallVector<const Type *, 8> argTypes;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
|
||||
const llvm::Type *llvmArgType = ftype->getParamType(j);
|
||||
const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
|
||||
if (type == NULL)
|
||||
if (type == NULL) {
|
||||
Debug(SourcePos(), "Failed: type of parameter %d not "
|
||||
"representable for builtin %s", j, name.c_str());
|
||||
return false;
|
||||
}
|
||||
anyIntArgs |=
|
||||
(Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
|
||||
anyReferenceArgs |= (dynamic_cast<const ReferenceType *>(type) != NULL);
|
||||
argTypes.push_back(type);
|
||||
}
|
||||
|
||||
@@ -261,19 +247,6 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
// so that we get symbols for things with no integer types!
|
||||
if (i == 0 || anyIntArgs == true)
|
||||
lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
|
||||
|
||||
// If there are any reference types, also make a variant of the
|
||||
// symbol that has them as const references. This obviously
|
||||
// doesn't make sense for many builtins, but we'll give the stdlib
|
||||
// the option to call one if it needs one.
|
||||
if (anyReferenceArgs == true) {
|
||||
for (unsigned int j = 0; j < argTypes.size(); ++j) {
|
||||
if (dynamic_cast<const ReferenceType *>(argTypes[j]) != NULL)
|
||||
argTypes[j] = argTypes[j]->GetAsConstType();
|
||||
lCreateSymbol(name + "_refsconst", returnType, argTypes,
|
||||
ftype, func, symbolTable);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -287,7 +260,7 @@ static void
|
||||
lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
#if 0
|
||||
// FIXME: handle globals?
|
||||
assert(module->global_empty());
|
||||
Assert(module->global_empty());
|
||||
#endif
|
||||
|
||||
llvm::Module::iterator iter;
|
||||
@@ -317,16 +290,300 @@ lCheckModuleIntrinsics(llvm::Module *module) {
|
||||
// check the llvm.x86.* intrinsics for now...
|
||||
if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
|
||||
llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
|
||||
assert(id != 0);
|
||||
LLVM_TYPE_CONST llvm::Type *intrinsicType =
|
||||
Assert(id != 0);
|
||||
llvm::Type *intrinsicType =
|
||||
llvm::Intrinsic::getType(*g->ctx, id);
|
||||
intrinsicType = llvm::PointerType::get(intrinsicType, 0);
|
||||
assert(func->getType() == intrinsicType);
|
||||
Assert(func->getType() == intrinsicType);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** We'd like to have all of these functions declared as 'internal' in
|
||||
their respective bitcode files so that if they aren't needed by the
|
||||
user's program they are eliminated from the final output. However, if
|
||||
we do so, then they aren't brought in by the LinkModules() call below
|
||||
since they aren't yet used by anything in the module they're being
|
||||
linked with (in LLVM 3.1, at least).
|
||||
|
||||
Therefore, we don't declare them as internal when we first define them,
|
||||
but instead mark them as internal after they've been linked in. This
|
||||
is admittedly a kludge.
|
||||
*/
|
||||
static void
|
||||
lSetInternalFunctions(llvm::Module *module) {
|
||||
const char *names[] = {
|
||||
"__add_float",
|
||||
"__add_int32",
|
||||
"__add_uniform_double",
|
||||
"__add_uniform_int32",
|
||||
"__add_uniform_int64",
|
||||
"__add_varying_double",
|
||||
"__add_varying_int32",
|
||||
"__add_varying_int64",
|
||||
"__aos_to_soa3_float",
|
||||
"__aos_to_soa3_float16",
|
||||
"__aos_to_soa3_float4",
|
||||
"__aos_to_soa3_float8",
|
||||
"__aos_to_soa3_int32",
|
||||
"__aos_to_soa4_float",
|
||||
"__aos_to_soa4_float16",
|
||||
"__aos_to_soa4_float4",
|
||||
"__aos_to_soa4_float8",
|
||||
"__aos_to_soa4_int32",
|
||||
"__atomic_add_int32_global",
|
||||
"__atomic_add_int64_global",
|
||||
"__atomic_add_uniform_int32_global",
|
||||
"__atomic_add_uniform_int64_global",
|
||||
"__atomic_and_int32_global",
|
||||
"__atomic_and_int64_global",
|
||||
"__atomic_and_uniform_int32_global",
|
||||
"__atomic_and_uniform_int64_global",
|
||||
"__atomic_compare_exchange_double_global",
|
||||
"__atomic_compare_exchange_float_global",
|
||||
"__atomic_compare_exchange_int32_global",
|
||||
"__atomic_compare_exchange_int64_global",
|
||||
"__atomic_compare_exchange_uniform_double_global",
|
||||
"__atomic_compare_exchange_uniform_float_global",
|
||||
"__atomic_compare_exchange_uniform_int32_global",
|
||||
"__atomic_compare_exchange_uniform_int64_global",
|
||||
"__atomic_max_uniform_int32_global",
|
||||
"__atomic_max_uniform_int64_global",
|
||||
"__atomic_min_uniform_int32_global",
|
||||
"__atomic_min_uniform_int64_global",
|
||||
"__atomic_or_int32_global",
|
||||
"__atomic_or_int64_global",
|
||||
"__atomic_or_uniform_int32_global",
|
||||
"__atomic_or_uniform_int64_global",
|
||||
"__atomic_sub_int32_global",
|
||||
"__atomic_sub_int64_global",
|
||||
"__atomic_sub_uniform_int32_global",
|
||||
"__atomic_sub_uniform_int64_global",
|
||||
"__atomic_swap_double_global",
|
||||
"__atomic_swap_float_global",
|
||||
"__atomic_swap_int32_global",
|
||||
"__atomic_swap_int64_global",
|
||||
"__atomic_swap_uniform_double_global",
|
||||
"__atomic_swap_uniform_float_global",
|
||||
"__atomic_swap_uniform_int32_global",
|
||||
"__atomic_swap_uniform_int64_global",
|
||||
"__atomic_umax_uniform_uint32_global",
|
||||
"__atomic_umax_uniform_uint64_global",
|
||||
"__atomic_umin_uniform_uint32_global",
|
||||
"__atomic_umin_uniform_uint64_global",
|
||||
"__atomic_xor_int32_global",
|
||||
"__atomic_xor_int64_global",
|
||||
"__atomic_xor_uniform_int32_global",
|
||||
"__atomic_xor_uniform_int64_global",
|
||||
"__broadcast_double",
|
||||
"__broadcast_float",
|
||||
"__broadcast_i16",
|
||||
"__broadcast_i32",
|
||||
"__broadcast_i64",
|
||||
"__broadcast_i8",
|
||||
"__ceil_uniform_double",
|
||||
"__ceil_uniform_float",
|
||||
"__ceil_varying_double",
|
||||
"__ceil_varying_float",
|
||||
"__clock",
|
||||
"__count_trailing_zeros_i32",
|
||||
"__count_trailing_zeros_i64",
|
||||
"__count_leading_zeros_i32",
|
||||
"__count_leading_zeros_i64",
|
||||
"__delete_uniform",
|
||||
"__delete_varying",
|
||||
"__do_assert_uniform",
|
||||
"__do_assert_varying",
|
||||
"__do_print",
|
||||
"__doublebits_uniform_int64",
|
||||
"__doublebits_varying_int64",
|
||||
"__exclusive_scan_add_double",
|
||||
"__exclusive_scan_add_float",
|
||||
"__exclusive_scan_add_i32",
|
||||
"__exclusive_scan_add_i64",
|
||||
"__exclusive_scan_and_i32",
|
||||
"__exclusive_scan_and_i64",
|
||||
"__exclusive_scan_or_i32",
|
||||
"__exclusive_scan_or_i64",
|
||||
"__extract_int16",
|
||||
"__extract_int32",
|
||||
"__extract_int64",
|
||||
"__extract_int8",
|
||||
"__fastmath",
|
||||
"__float_to_half_uniform",
|
||||
"__float_to_half_varying",
|
||||
"__floatbits_uniform_int32",
|
||||
"__floatbits_varying_int32",
|
||||
"__floor_uniform_double",
|
||||
"__floor_uniform_float",
|
||||
"__floor_varying_double",
|
||||
"__floor_varying_float",
|
||||
"__half_to_float_uniform",
|
||||
"__half_to_float_varying",
|
||||
"__insert_int16",
|
||||
"__insert_int32",
|
||||
"__insert_int64",
|
||||
"__insert_int8",
|
||||
"__intbits_uniform_double",
|
||||
"__intbits_uniform_float",
|
||||
"__intbits_varying_double",
|
||||
"__intbits_varying_float",
|
||||
"__max_uniform_double",
|
||||
"__max_uniform_float",
|
||||
"__max_uniform_int32",
|
||||
"__max_uniform_int64",
|
||||
"__max_uniform_uint32",
|
||||
"__max_uniform_uint64",
|
||||
"__max_varying_double",
|
||||
"__max_varying_float",
|
||||
"__max_varying_int32",
|
||||
"__max_varying_int64",
|
||||
"__max_varying_uint32",
|
||||
"__max_varying_uint64",
|
||||
"__memory_barrier",
|
||||
"__memcpy32",
|
||||
"__memcpy64",
|
||||
"__memmove32",
|
||||
"__memmove64",
|
||||
"__memset32",
|
||||
"__memset64",
|
||||
"__min_uniform_double",
|
||||
"__min_uniform_float",
|
||||
"__min_uniform_int32",
|
||||
"__min_uniform_int64",
|
||||
"__min_uniform_uint32",
|
||||
"__min_uniform_uint64",
|
||||
"__min_varying_double",
|
||||
"__min_varying_float",
|
||||
"__min_varying_int32",
|
||||
"__min_varying_int64",
|
||||
"__min_varying_uint32",
|
||||
"__min_varying_uint64",
|
||||
"__movmsk",
|
||||
"__new_uniform",
|
||||
"__new_varying32",
|
||||
"__new_varying64",
|
||||
"__num_cores",
|
||||
"__packed_load_active",
|
||||
"__packed_store_active",
|
||||
"__pause",
|
||||
"__popcnt_int32",
|
||||
"__popcnt_int64",
|
||||
"__prefetch_read_uniform_1",
|
||||
"__prefetch_read_uniform_2",
|
||||
"__prefetch_read_uniform_3",
|
||||
"__prefetch_read_uniform_nt",
|
||||
"__rcp_uniform_float",
|
||||
"__rcp_varying_float",
|
||||
"__reduce_add_double",
|
||||
"__reduce_add_float",
|
||||
"__reduce_add_int32",
|
||||
"__reduce_add_int64",
|
||||
"__reduce_add_uint32",
|
||||
"__reduce_add_uint64",
|
||||
"__reduce_equal_double",
|
||||
"__reduce_equal_float",
|
||||
"__reduce_equal_int32",
|
||||
"__reduce_equal_int64",
|
||||
"__reduce_max_double",
|
||||
"__reduce_max_float",
|
||||
"__reduce_max_int32",
|
||||
"__reduce_max_int64",
|
||||
"__reduce_max_uint32",
|
||||
"__reduce_max_uint64",
|
||||
"__reduce_min_double",
|
||||
"__reduce_min_float",
|
||||
"__reduce_min_int32",
|
||||
"__reduce_min_int64",
|
||||
"__reduce_min_uint32",
|
||||
"__reduce_min_uint64",
|
||||
"__rotate_double",
|
||||
"__rotate_float",
|
||||
"__rotate_i16",
|
||||
"__rotate_i32",
|
||||
"__rotate_i64",
|
||||
"__rotate_i8",
|
||||
"__round_uniform_double",
|
||||
"__round_uniform_float",
|
||||
"__round_varying_double",
|
||||
"__round_varying_float",
|
||||
"__rsqrt_uniform_float",
|
||||
"__rsqrt_varying_float",
|
||||
"__sext_uniform_bool",
|
||||
"__sext_varying_bool",
|
||||
"__shuffle2_double",
|
||||
"__shuffle2_float",
|
||||
"__shuffle2_i16",
|
||||
"__shuffle2_i32",
|
||||
"__shuffle2_i64",
|
||||
"__shuffle2_i8",
|
||||
"__shuffle_double",
|
||||
"__shuffle_float",
|
||||
"__shuffle_i16",
|
||||
"__shuffle_i32",
|
||||
"__shuffle_i64",
|
||||
"__shuffle_i8",
|
||||
"__soa_to_aos3_float",
|
||||
"__soa_to_aos3_float16",
|
||||
"__soa_to_aos3_float4",
|
||||
"__soa_to_aos3_float8",
|
||||
"__soa_to_aos3_int32",
|
||||
"__soa_to_aos4_float",
|
||||
"__soa_to_aos4_float16",
|
||||
"__soa_to_aos4_float4",
|
||||
"__soa_to_aos4_float8",
|
||||
"__soa_to_aos4_int32",
|
||||
"__sqrt_uniform_double",
|
||||
"__sqrt_uniform_float",
|
||||
"__sqrt_varying_double",
|
||||
"__sqrt_varying_float",
|
||||
"__stdlib_acosf",
|
||||
"__stdlib_asinf",
|
||||
"__stdlib_atan",
|
||||
"__stdlib_atan2",
|
||||
"__stdlib_atan2f",
|
||||
"__stdlib_atanf",
|
||||
"__stdlib_cos",
|
||||
"__stdlib_cosf",
|
||||
"__stdlib_exp",
|
||||
"__stdlib_expf",
|
||||
"__stdlib_log",
|
||||
"__stdlib_logf",
|
||||
"__stdlib_pow",
|
||||
"__stdlib_powf",
|
||||
"__stdlib_sin",
|
||||
"__stdlib_sincos",
|
||||
"__stdlib_sincosf",
|
||||
"__stdlib_sinf",
|
||||
"__stdlib_tan",
|
||||
"__stdlib_tanf",
|
||||
"__svml_sin",
|
||||
"__svml_cos",
|
||||
"__svml_sincos",
|
||||
"__svml_tan",
|
||||
"__svml_atan",
|
||||
"__svml_atan2",
|
||||
"__svml_exp",
|
||||
"__svml_log",
|
||||
"__svml_pow",
|
||||
"__undef_uniform",
|
||||
"__undef_varying",
|
||||
"__vec4_add_float",
|
||||
"__vec4_add_int32",
|
||||
"__vselect_float",
|
||||
"__vselect_i32",
|
||||
};
|
||||
|
||||
int count = sizeof(names) / sizeof(names[0]);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
llvm::Function *f = module->getFunction(names[i]);
|
||||
if (f != NULL && f->empty() == false)
|
||||
f->setLinkage(llvm::GlobalValue::InternalLinkage);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** This utility function takes serialized binary LLVM bitcode and adds its
|
||||
definitions to the given module. Functions in the bitcode that can be
|
||||
mapped to ispc functions are also added to the symbol table.
|
||||
@@ -336,9 +593,9 @@ lCheckModuleIntrinsics(llvm::Module *module) {
|
||||
@param module Module to link the bitcode into
|
||||
@param symbolTable Symbol table to add definitions to
|
||||
*/
|
||||
static void
|
||||
lAddBitcode(const unsigned char *bitcode, int length,
|
||||
llvm::Module *module, SymbolTable *symbolTable) {
|
||||
void
|
||||
AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
llvm::Module *module, SymbolTable *symbolTable) {
|
||||
std::string bcErr;
|
||||
llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
|
||||
llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
|
||||
@@ -356,16 +613,20 @@ lAddBitcode(const unsigned char *bitcode, int length,
|
||||
// linking together modules with incompatible target triples..
|
||||
llvm::Triple mTriple(m->module->getTargetTriple());
|
||||
llvm::Triple bcTriple(bcModule->getTargetTriple());
|
||||
assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
|
||||
Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
|
||||
mTriple.getArch() == bcTriple.getArch());
|
||||
assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
|
||||
Assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
|
||||
mTriple.getVendor() == bcTriple.getVendor());
|
||||
bcModule->setTargetTriple(mTriple.str());
|
||||
|
||||
std::string(linkError);
|
||||
if (llvm::Linker::LinkModules(module, bcModule, &linkError))
|
||||
if (llvm::Linker::LinkModules(module, bcModule,
|
||||
llvm::Linker::DestroySource,
|
||||
&linkError))
|
||||
Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
|
||||
lAddModuleSymbols(module, symbolTable);
|
||||
lSetInternalFunctions(module);
|
||||
if (symbolTable != NULL)
|
||||
lAddModuleSymbols(module, symbolTable);
|
||||
lCheckModuleIntrinsics(module);
|
||||
}
|
||||
}
|
||||
@@ -377,35 +638,93 @@ lAddBitcode(const unsigned char *bitcode, int length,
|
||||
static void
|
||||
lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
||||
SymbolTable *symbolTable) {
|
||||
Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32);
|
||||
pw->isStatic = true;
|
||||
pw->constValue = new ConstExpr(pw->type, val, SourcePos());
|
||||
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
|
||||
Symbol *sym =
|
||||
new Symbol(name, SourcePos(), AtomicType::UniformInt32->GetAsConstType(),
|
||||
SC_STATIC);
|
||||
sym->constValue = new ConstExpr(sym->type, val, SourcePos());
|
||||
llvm::Type *ltype = LLVMTypes::Int32Type;
|
||||
llvm::Constant *linit = LLVMInt32(val);
|
||||
pw->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
|
||||
llvm::GlobalValue::InternalLinkage,
|
||||
linit, pw->name.c_str());
|
||||
symbolTable->AddVariable(pw);
|
||||
// Use WeakODRLinkage rather than InternalLinkage so that a definition
|
||||
// survives even if it's not used in the module, so that the symbol is
|
||||
// there in the debugger.
|
||||
llvm::GlobalValue::LinkageTypes linkage = g->generateDebuggingSymbols ?
|
||||
llvm::GlobalValue::WeakODRLinkage : llvm::GlobalValue::InternalLinkage;
|
||||
sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, linkage,
|
||||
linit, name);
|
||||
symbolTable->AddVariable(sym);
|
||||
|
||||
if (m->diBuilder != NULL) {
|
||||
llvm::DIFile file;
|
||||
llvm::DIType diType = sym->type->GetDIType(file);
|
||||
Assert(diType.Verify());
|
||||
// FIXME? DWARF says that this (and programIndex below) should
|
||||
// have the DW_AT_artificial attribute.  It's not clear if this
|
||||
// matters for anything though.
|
||||
llvm::DIGlobalVariable var =
|
||||
m->diBuilder->createGlobalVariable(name,
|
||||
file,
|
||||
0 /* line */,
|
||||
diType,
|
||||
true /* static */,
|
||||
sym->storagePtr);
|
||||
Assert(var.Verify());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void
|
||||
lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
||||
SymbolTable *symbolTable) {
|
||||
llvm::SmallVector<const Type *, 8> args;
|
||||
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
|
||||
Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
|
||||
|
||||
llvm::Function *func = module->getFunction(name);
|
||||
Assert(func != NULL); // it should be declared already...
|
||||
func->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
|
||||
llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
|
||||
|
||||
sym->function = func;
|
||||
symbolTable->AddVariable(sym);
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void
|
||||
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
Symbol *pidx = new Symbol("programIndex", SourcePos(),
|
||||
AtomicType::VaryingConstInt32);
|
||||
pidx->isStatic = true;
|
||||
Symbol *sym =
|
||||
new Symbol("programIndex", SourcePos(),
|
||||
AtomicType::VaryingInt32->GetAsConstType(), SC_STATIC);
|
||||
|
||||
int pi[ISPC_MAX_NVEC];
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
pi[i] = i;
|
||||
pidx->constValue = new ConstExpr(pidx->type, pi, SourcePos());
|
||||
sym->constValue = new ConstExpr(sym->type, pi, SourcePos());
|
||||
|
||||
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32VectorType;
|
||||
llvm::Type *ltype = LLVMTypes::Int32VectorType;
|
||||
llvm::Constant *linit = LLVMInt32Vector(pi);
|
||||
pidx->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
|
||||
llvm::GlobalValue::InternalLinkage, linit,
|
||||
pidx->name.c_str());
|
||||
symbolTable->AddVariable(pidx);
|
||||
// See comment in lDefineConstantInt() for why WeakODRLinkage is used here
|
||||
llvm::GlobalValue::LinkageTypes linkage = g->generateDebuggingSymbols ?
|
||||
llvm::GlobalValue::WeakODRLinkage : llvm::GlobalValue::InternalLinkage;
|
||||
sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, linkage,
|
||||
linit, sym->name.c_str());
|
||||
symbolTable->AddVariable(sym);
|
||||
|
||||
if (m->diBuilder != NULL) {
|
||||
llvm::DIFile file;
|
||||
llvm::DIType diType = sym->type->GetDIType(file);
|
||||
Assert(diType.Verify());
|
||||
llvm::DIGlobalVariable var =
|
||||
m->diBuilder->createGlobalVariable(sym->name.c_str(),
|
||||
file,
|
||||
0 /* line */,
|
||||
diType,
|
||||
false /* static */,
|
||||
sym->storagePtr);
|
||||
Assert(var.Verify());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -413,17 +732,17 @@ void
|
||||
DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
|
||||
bool includeStdlibISPC) {
|
||||
// Add the definitions from the compiled builtins-c.c file
|
||||
if (g->target.is32bit) {
|
||||
if (g->target.is32Bit) {
|
||||
extern unsigned char builtins_bitcode_c_32[];
|
||||
extern int builtins_bitcode_c_32_length;
|
||||
lAddBitcode(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
|
||||
module, symbolTable);
|
||||
AddBitcodeToModule(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
|
||||
module, symbolTable);
|
||||
}
|
||||
else {
|
||||
extern unsigned char builtins_bitcode_c_64[];
|
||||
extern int builtins_bitcode_c_64_length;
|
||||
lAddBitcode(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
|
||||
module, symbolTable);
|
||||
AddBitcodeToModule(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
|
||||
module, symbolTable);
|
||||
}
|
||||
|
||||
// Next, add the target's custom implementations of the various needed
|
||||
@@ -432,22 +751,36 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
case Target::SSE2:
|
||||
extern unsigned char builtins_bitcode_sse2[];
|
||||
extern int builtins_bitcode_sse2_length;
|
||||
lAddBitcode(builtins_bitcode_sse2, builtins_bitcode_sse2_length, module,
|
||||
symbolTable);
|
||||
extern unsigned char builtins_bitcode_sse2_x2[];
|
||||
extern int builtins_bitcode_sse2_x2_length;
|
||||
switch (g->target.vectorWidth) {
|
||||
case 4:
|
||||
AddBitcodeToModule(builtins_bitcode_sse2, builtins_bitcode_sse2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 8:
|
||||
AddBitcodeToModule(builtins_bitcode_sse2_x2, builtins_bitcode_sse2_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
case Target::SSE4:
|
||||
extern unsigned char builtins_bitcode_sse4[];
|
||||
extern int builtins_bitcode_sse4_length;
|
||||
extern unsigned char builtins_bitcode_sse4x2[];
|
||||
extern int builtins_bitcode_sse4x2_length;
|
||||
extern unsigned char builtins_bitcode_sse4_x2[];
|
||||
extern int builtins_bitcode_sse4_x2_length;
|
||||
switch (g->target.vectorWidth) {
|
||||
case 4:
|
||||
lAddBitcode(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
|
||||
module, symbolTable);
|
||||
AddBitcodeToModule(builtins_bitcode_sse4,
|
||||
builtins_bitcode_sse4_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 8:
|
||||
lAddBitcode(builtins_bitcode_sse4x2, builtins_bitcode_sse4x2_length,
|
||||
module, symbolTable);
|
||||
AddBitcodeToModule(builtins_bitcode_sse4_x2,
|
||||
builtins_bitcode_sse4_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
@@ -456,16 +789,106 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
case Target::AVX:
|
||||
switch (g->target.vectorWidth) {
|
||||
case 8:
|
||||
extern unsigned char builtins_bitcode_avx[];
|
||||
extern int builtins_bitcode_avx_length;
|
||||
lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module,
|
||||
symbolTable);
|
||||
extern unsigned char builtins_bitcode_avx1[];
|
||||
extern int builtins_bitcode_avx1_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx1,
|
||||
builtins_bitcode_avx1_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 16:
|
||||
extern unsigned char builtins_bitcode_avx_x2[];
|
||||
extern int builtins_bitcode_avx_x2_length;
|
||||
lAddBitcode(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
|
||||
module, symbolTable);
|
||||
extern unsigned char builtins_bitcode_avx1_x2[];
|
||||
extern int builtins_bitcode_avx1_x2_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx1_x2,
|
||||
builtins_bitcode_avx1_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
case Target::AVX11:
|
||||
switch (g->target.vectorWidth) {
|
||||
case 8:
|
||||
extern unsigned char builtins_bitcode_avx11[];
|
||||
extern int builtins_bitcode_avx11_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx11,
|
||||
builtins_bitcode_avx11_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 16:
|
||||
extern unsigned char builtins_bitcode_avx11_x2[];
|
||||
extern int builtins_bitcode_avx11_x2_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx11_x2,
|
||||
builtins_bitcode_avx11_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
case Target::AVX2:
|
||||
switch (g->target.vectorWidth) {
|
||||
case 8:
|
||||
extern unsigned char builtins_bitcode_avx2[];
|
||||
extern int builtins_bitcode_avx2_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx2,
|
||||
builtins_bitcode_avx2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 16:
|
||||
extern unsigned char builtins_bitcode_avx2_x2[];
|
||||
extern int builtins_bitcode_avx2_x2_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx2_x2,
|
||||
builtins_bitcode_avx2_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
case Target::GENERIC:
|
||||
switch (g->target.vectorWidth) {
|
||||
case 4:
|
||||
extern unsigned char builtins_bitcode_generic_4[];
|
||||
extern int builtins_bitcode_generic_4_length;
|
||||
AddBitcodeToModule(builtins_bitcode_generic_4,
|
||||
builtins_bitcode_generic_4_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 8:
|
||||
extern unsigned char builtins_bitcode_generic_8[];
|
||||
extern int builtins_bitcode_generic_8_length;
|
||||
AddBitcodeToModule(builtins_bitcode_generic_8,
|
||||
builtins_bitcode_generic_8_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 16:
|
||||
extern unsigned char builtins_bitcode_generic_16[];
|
||||
extern int builtins_bitcode_generic_16_length;
|
||||
AddBitcodeToModule(builtins_bitcode_generic_16,
|
||||
builtins_bitcode_generic_16_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 32:
|
||||
extern unsigned char builtins_bitcode_generic_32[];
|
||||
extern int builtins_bitcode_generic_32_length;
|
||||
AddBitcodeToModule(builtins_bitcode_generic_32,
|
||||
builtins_bitcode_generic_32_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 64:
|
||||
extern unsigned char builtins_bitcode_generic_64[];
|
||||
extern int builtins_bitcode_generic_64_length;
|
||||
AddBitcodeToModule(builtins_bitcode_generic_64,
|
||||
builtins_bitcode_generic_64_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 1:
|
||||
extern unsigned char builtins_bitcode_generic_1[];
|
||||
extern int builtins_bitcode_generic_1_length;
|
||||
AddBitcodeToModule(builtins_bitcode_generic_1,
|
||||
builtins_bitcode_generic_1_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
@@ -492,18 +915,29 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
symbolTable);
|
||||
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
|
||||
symbolTable);
|
||||
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload,
|
||||
module, symbolTable);
|
||||
|
||||
lDefineConstantInt("__have_native_half", g->target.hasHalf, module,
|
||||
symbolTable);
|
||||
lDefineConstantInt("__have_native_rand", g->target.hasRand, module,
|
||||
symbolTable);
|
||||
lDefineConstantInt("__have_native_transcendentals", g->target.hasTranscendentals,
|
||||
module, symbolTable);
|
||||
|
||||
if (includeStdlibISPC) {
|
||||
// If the user wants the standard library to be included, parse the
|
||||
// serialized version of the stdlib.ispc file to get its
|
||||
// definitions added. Disable emission of performance warnings for
|
||||
// now, since the user doesn't care about any of that in the stdlib
|
||||
// implementation...
|
||||
bool epf = g->emitPerfWarnings;
|
||||
g->emitPerfWarnings = false;
|
||||
extern char stdlib_code[];
|
||||
yy_scan_string(stdlib_code);
|
||||
yyparse();
|
||||
g->emitPerfWarnings = epf;
|
||||
// definitions added.
|
||||
if (g->target.isa == Target::GENERIC&&g->target.vectorWidth!=1) { // 1 wide uses x86 stdlib
|
||||
extern char stdlib_generic_code[];
|
||||
yy_scan_string(stdlib_generic_code);
|
||||
yyparse();
|
||||
}
|
||||
else {
|
||||
extern char stdlib_x86_code[];
|
||||
yy_scan_string(stdlib_x86_code);
|
||||
yyparse();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,4 +55,7 @@
|
||||
void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
|
||||
bool includeStdlib);
|
||||
|
||||
void AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
llvm::Module *module, SymbolTable *symbolTable = NULL);
|
||||
|
||||
#endif // ISPC_STDLIB_H
|
||||
|
||||
1979
builtins.m4
1979
builtins.m4
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -51,25 +51,47 @@
|
||||
*/
|
||||
|
||||
|
||||
#ifndef _MSC_VER
|
||||
#include <unistd.h>
|
||||
#endif // !_MSC_VER
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
|
||||
typedef int Bool;
|
||||
|
||||
#define PRINT_SCALAR(fmt, type) \
|
||||
printf(fmt, *((type *)ptr)); \
|
||||
#define PRINT_BUF_SIZE 4096
|
||||
|
||||
#define APPEND(str) \
|
||||
do { \
|
||||
int offset = bufp - &printString[0]; \
|
||||
*bufp = '\0'; \
|
||||
strncat(bufp, str, PRINT_BUF_SIZE-offset); \
|
||||
bufp += strlen(str); \
|
||||
if (bufp >= &printString[PRINT_BUF_SIZE]) \
|
||||
goto done; \
|
||||
} while (0) /* eat semicolon */
|
||||
|
||||
|
||||
#define PRINT_SCALAR(fmt, type) \
|
||||
sprintf(tmpBuf, fmt, *((type *)ptr)); \
|
||||
APPEND(tmpBuf); \
|
||||
break
|
||||
|
||||
#define PRINT_VECTOR(fmt, type) \
|
||||
putchar('['); \
|
||||
*bufp++ = '['; \
|
||||
if (bufp == &printString[PRINT_BUF_SIZE]) break; \
|
||||
for (int i = 0; i < width; ++i) { \
|
||||
/* only print the value if the current lane is executing */ \
|
||||
if (mask & (1<<i)) \
|
||||
printf(fmt, ((type *)ptr)[i]); \
|
||||
if (mask & (1ull<<i)) \
|
||||
sprintf(tmpBuf, fmt, ((type *)ptr)[i]); \
|
||||
else \
|
||||
printf("((" fmt "))", ((type *)ptr)[i]); \
|
||||
putchar(i != width-1 ? ',' : ']'); \
|
||||
sprintf(tmpBuf, "((" fmt "))", ((type *)ptr)[i]); \
|
||||
APPEND(tmpBuf); \
|
||||
*bufp++ = (i != width-1 ? ',' : ']'); \
|
||||
} \
|
||||
break
|
||||
|
||||
@@ -84,16 +106,18 @@ typedef int Bool;
|
||||
@param mask Current lane mask when the print statement is called
|
||||
@param args Array of pointers to the values to be printed
|
||||
*/
|
||||
void __do_print(const char *format, const char *types, int width, int mask,
|
||||
void __do_print(const char *format, const char *types, int width, uint64_t mask,
|
||||
void **args) {
|
||||
if (mask == 0)
|
||||
return;
|
||||
char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
|
||||
char *bufp = &printString[0];
|
||||
char tmpBuf[256];
|
||||
|
||||
int argCount = 0;
|
||||
while (*format) {
|
||||
while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
|
||||
// Format strings are just single percent signs.
|
||||
if (*format != '%')
|
||||
putchar(*format);
|
||||
if (*format != '%') {
|
||||
*bufp++ = *format;
|
||||
}
|
||||
else {
|
||||
if (*types) {
|
||||
void *ptr = args[argCount++];
|
||||
@@ -102,17 +126,22 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
||||
// printf() formatting string.
|
||||
switch (*types) {
|
||||
case 'b': {
|
||||
printf("%s", *((Bool *)ptr) ? "true" : "false");
|
||||
sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
|
||||
APPEND(tmpBuf);
|
||||
break;
|
||||
}
|
||||
case 'B': {
|
||||
putchar('[');
|
||||
*bufp++ = '[';
|
||||
if (bufp == &printString[PRINT_BUF_SIZE])
|
||||
break;
|
||||
for (int i = 0; i < width; ++i) {
|
||||
if (mask & (1<<i))
|
||||
printf("%s", ((Bool *)ptr)[i] ? "true" : "false");
|
||||
if (mask & (1ull << i)) {
|
||||
sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
|
||||
APPEND(tmpBuf);
|
||||
}
|
||||
else
|
||||
printf("_________");
|
||||
putchar(i != width-1 ? ',' : ']');
|
||||
APPEND("_________");
|
||||
*bufp++ = (i != width-1) ? ',' : ']';
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -128,14 +157,45 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
||||
case 'V': PRINT_VECTOR("%llu", unsigned long long);
|
||||
case 'd': PRINT_SCALAR("%f", double);
|
||||
case 'D': PRINT_VECTOR("%f", double);
|
||||
case 'p': PRINT_SCALAR("%p", void *);
|
||||
case 'P': PRINT_VECTOR("%p", void *);
|
||||
default:
|
||||
printf("UNKNOWN TYPE ");
|
||||
putchar(*types);
|
||||
APPEND("UNKNOWN TYPE ");
|
||||
*bufp++ = *types;
|
||||
}
|
||||
++types;
|
||||
}
|
||||
}
|
||||
++format;
|
||||
}
|
||||
|
||||
done:
|
||||
*bufp = '\0';
|
||||
fputs(printString, stdout);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
|
||||
int __num_cores() {
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
// This is quite a hack. Including all of windows.h to get this definition
|
||||
// pulls in a bunch of stuff that leads to undefined symbols at link time.
|
||||
// So we don't #include <windows.h> but instead have the equivalent declarations
|
||||
// here. Presumably this struct declaration won't be changing in the future
|
||||
// anyway...
|
||||
struct SYSTEM_INFO {
|
||||
int pad0[2];
|
||||
void *pad1[2];
|
||||
int *pad2;
|
||||
int dwNumberOfProcessors;
|
||||
int pad3[3];
|
||||
};
|
||||
|
||||
struct SYSTEM_INFO sysInfo;
|
||||
extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
|
||||
GetSystemInfo(&sysInfo);
|
||||
return sysInfo.dwNumberOfProcessors;
|
||||
#else
|
||||
return sysconf(_SC_NPROCESSORS_ONLN);
|
||||
#endif // !_MSC_VER
|
||||
}
|
||||
162
builtins/dispatch.ll
Normal file
162
builtins/dispatch.ll
Normal file
@@ -0,0 +1,162 @@
|
||||
;; Copyright (c) 2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;; This file defines various functions that are used when generating the
|
||||
;; "dispatch" object/assembly file that has entrypoints for each
|
||||
;; exported function in a module that dispatch to the best available
|
||||
;; variant of that function that will run on the system's CPU.
|
||||
|
||||
;; Stores the best target ISA that the system on which we're actually
|
||||
;; running supports. -1 represents "uninitialized", otherwise this value
|
||||
;; should correspond to one of the enumerant values of Target::ISA from
|
||||
;; ispc.h.
|
||||
|
||||
@__system_best_isa = internal global i32 -1
|
||||
|
||||
declare void @abort() noreturn
|
||||
|
||||
;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
|
||||
;; following code... Specifically, __get_system_isa should return a value
|
||||
;; corresponding to one of the Target::ISA enumerant values that gives the
|
||||
;; most capable ISA that the current system can run.
|
||||
;;
|
||||
;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum
|
||||
;; backwards compatibility for anyone building ispc with LLVM 3.0
|
||||
;;
|
||||
;; #include <stdint.h>
|
||||
;; #include <stdlib.h>
|
||||
;;
|
||||
;; static void __cpuid(int info[4], int infoType) {
|
||||
;; __asm__ __volatile__ ("cpuid"
|
||||
;; : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
;; : "0" (infoType));
|
||||
;; }
|
||||
;;
|
||||
;; /* Save %ebx in case it's the PIC register */
|
||||
;; static void __cpuid_count(int info[4], int level, int count) {
|
||||
;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
|
||||
;; "cpuid\n\t"
|
||||
;; "xchg{l}\t{%%}ebx, %1\n\t"
|
||||
;; : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
;; : "0" (level), "2" (count));
|
||||
;; }
|
||||
;;
|
||||
;; int32_t __get_system_isa() {
|
||||
;; int info[4];
|
||||
;; __cpuid(info, 1);
|
||||
;;
|
||||
;; /* NOTE: the values returned below must be the same as the
|
||||
;; corresponding enumerant values in Target::ISA. */
|
||||
;; if ((info[2] & (1 << 28)) != 0) {
|
||||
;; // AVX1 for sure. Do we have AVX2?
|
||||
;; // Call cpuid with eax=7, ecx=0
|
||||
;; __cpuid_count(info, 7, 0);
|
||||
;; if ((info[1] & (1 << 5)) != 0)
|
||||
;; return 4; // AVX2
|
||||
;; else {
|
||||
;; if ((info[2] & (1 << 29)) != 0 && // F16C
|
||||
;; (info[2] & (1 << 30)) != 0) // RDRAND
|
||||
;; return 3; // AVX1 on IVB
|
||||
;; else
|
||||
;; return 2; // AVX1
|
||||
;; }
|
||||
;; }
|
||||
;; else if ((info[2] & (1 << 19)) != 0)
|
||||
;; return 1; // SSE4
|
||||
;; else if ((info[3] & (1 << 26)) != 0)
|
||||
;; return 0; // SSE2
|
||||
;; else
|
||||
;; abort();
|
||||
;; }
|
||||
|
||||
define i32 @__get_system_isa() nounwind uwtable ssp {
|
||||
entry:
|
||||
%0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
|
||||
%asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2
|
||||
%asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
|
||||
%and = and i32 %asmresult5.i, 268435456
|
||||
%cmp = icmp eq i32 %and, 0
|
||||
br i1 %cmp, label %if.else14, label %if.then
|
||||
|
||||
if.then: ; preds = %entry
|
||||
%1 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
|
||||
%asmresult4.i29 = extractvalue { i32, i32, i32, i32 } %1, 1
|
||||
%and3 = and i32 %asmresult4.i29, 32
|
||||
%cmp4 = icmp eq i32 %and3, 0
|
||||
br i1 %cmp4, label %if.else, label %return
|
||||
|
||||
if.else: ; preds = %if.then
|
||||
%asmresult5.i30 = extractvalue { i32, i32, i32, i32 } %1, 2
|
||||
%2 = and i32 %asmresult5.i30, 1610612736
|
||||
%3 = icmp eq i32 %2, 1610612736
|
||||
br i1 %3, label %return, label %if.else13
|
||||
|
||||
if.else13: ; preds = %if.else
|
||||
br label %return
|
||||
|
||||
if.else14: ; preds = %entry
|
||||
%and16 = and i32 %asmresult5.i, 524288
|
||||
%cmp17 = icmp eq i32 %and16, 0
|
||||
br i1 %cmp17, label %if.else19, label %return
|
||||
|
||||
if.else19: ; preds = %if.else14
|
||||
%and21 = and i32 %asmresult6.i, 67108864
|
||||
%cmp22 = icmp eq i32 %and21, 0
|
||||
br i1 %cmp22, label %if.else24, label %return
|
||||
|
||||
if.else24: ; preds = %if.else19
|
||||
tail call void @abort() noreturn nounwind
|
||||
unreachable
|
||||
|
||||
return: ; preds = %if.else19, %if.else14, %if.else13, %if.else, %if.then
|
||||
%retval.0 = phi i32 [ 2, %if.else13 ], [ 4, %if.then ], [ 3, %if.else ], [ 1, %if.else14 ], [ 0, %if.else19 ]
|
||||
ret i32 %retval.0
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; This function is called by each of the dispatch functions we generate;
|
||||
;; it sets @__system_best_isa if it is unset.
|
||||
|
||||
define void @__set_system_isa() {
|
||||
entry:
|
||||
%bi = load i32* @__system_best_isa
|
||||
%unset = icmp eq i32 %bi, -1
|
||||
br i1 %unset, label %set_system_isa, label %done
|
||||
|
||||
set_system_isa:
|
||||
%bival = call i32 @__get_system_isa()
|
||||
store i32 %bival, i32* @__system_best_isa
|
||||
ret void
|
||||
|
||||
done:
|
||||
ret void
|
||||
}
|
||||
|
||||
@@ -30,18 +30,19 @@
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; *** Untested *** AVX target implementation.
|
||||
;;
|
||||
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||
;; chance that there are bugs in the code in this file.
|
||||
;; AVX target implementation.
|
||||
|
||||
ctlztz()
|
||||
define_prefetches()
|
||||
define_shuffles()
|
||||
aossoa()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float iv = extract(__rcp_u(v), 0);
|
||||
; return iv * (2. - v * iv);
|
||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||
@@ -60,7 +61,7 @@ define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
; the roundss intrinsic is a total mess--docs say:
|
||||
;
|
||||
@@ -83,7 +84,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
@@ -92,7 +93,7 @@ define internal float @__floor_uniform_float(float) nounwind readonly alwaysinli
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
@@ -106,14 +107,14 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
|
||||
|
||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
@@ -122,7 +123,7 @@ define internal double @__floor_uniform_double(double) nounwind readonly alwaysi
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
@@ -137,7 +138,7 @@ define internal double @__ceil_uniform_double(double) nounwind readonly alwaysin
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||
@@ -158,7 +159,7 @@ define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinli
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
@@ -170,7 +171,7 @@ define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinlin
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define internal void @__fastmath() nounwind alwaysinline {
|
||||
define void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
@@ -189,12 +190,12 @@ define internal void @__fastmath() nounwind alwaysinline {
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
@@ -206,12 +207,12 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
|
||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
@@ -223,12 +224,12 @@ define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinlin
|
||||
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
@@ -238,14 +239,14 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
@@ -255,7 +256,7 @@ define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
|
||||
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
@@ -267,12 +268,12 @@ define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -29,29 +29,26 @@
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; *** Untested *** AVX target implementation.
|
||||
;;
|
||||
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||
;; chance that there are bugs in the code in this file.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Basic 16-wide definitions
|
||||
|
||||
stdlib_core(16)
|
||||
packed_load_and_store(16)
|
||||
scans(16)
|
||||
int64minmax(16)
|
||||
define(`WIDTH',`16')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
include(`builtins-avx-common.ll')
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
|
||||
include(`target-avx-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
|
||||
@@ -71,17 +68,17 @@ define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonl
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
define internal <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
round8to16(%0, 8)
|
||||
}
|
||||
|
||||
define internal <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round8to16(%0, 9)
|
||||
}
|
||||
|
||||
define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round8to16(%0, 10)
|
||||
}
|
||||
@@ -91,15 +88,15 @@ define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readon
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
||||
|
||||
define internal <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
round4to16double(%0, 8)
|
||||
}
|
||||
|
||||
define internal <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
round4to16double(%0, 9)
|
||||
}
|
||||
|
||||
define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
round4to16double(%0, 10)
|
||||
}
|
||||
|
||||
@@ -109,7 +106,7 @@ define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind rea
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
|
||||
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
@@ -132,7 +129,7 @@ define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind re
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
@@ -160,52 +157,25 @@ declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
|
||||
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define internal <16 x float> @__max_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
define <16 x float> @__max_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
|
||||
define internal <16 x float> @__min_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
define <16 x float> @__min_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
define internal <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define internal <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
define internal <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define internal <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
@@ -216,7 +186,8 @@ define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
|
||||
%v1shift = shl i32 %v1, 8
|
||||
%v = or i32 %v1shift, %v0
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@@ -224,7 +195,7 @@ define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define internal float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
%va = shufflevector <16 x float> %0, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%vb = shufflevector <16 x float> %0, <16 x float> undef,
|
||||
@@ -232,19 +203,19 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always
|
||||
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %va, <8 x float> %vb)
|
||||
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
|
||||
%v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2)
|
||||
%scalar1 = extractelement <8 x float> %v2, i32 0
|
||||
%scalar2 = extractelement <8 x float> %v2, i32 4
|
||||
%scalar1 = extractelement <8 x float> %v3, i32 0
|
||||
%scalar2 = extractelement <8 x float> %v3, i32 4
|
||||
%sum = fadd float %scalar1, %scalar2
|
||||
ret float %sum
|
||||
}
|
||||
|
||||
|
||||
define internal float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
|
||||
define float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
|
||||
reduce16(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
|
||||
define internal float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
|
||||
define float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
|
||||
reduce16(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
@@ -253,28 +224,28 @@ reduce_equal(16)
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int32 ops
|
||||
|
||||
define internal <16 x i32> @__add_varying_int32(<16 x i32>,
|
||||
define <16 x i32> @__add_varying_int32(<16 x i32>,
|
||||
<16 x i32>) nounwind readnone alwaysinline {
|
||||
%s = add <16 x i32> %0, %1
|
||||
ret <16 x i32> %s
|
||||
}
|
||||
|
||||
define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
%s = add i32 %0, %1
|
||||
ret i32 %s
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
define internal i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
@@ -282,17 +253,17 @@ define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinli
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint32 ops
|
||||
|
||||
define internal i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<16 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
@@ -302,7 +273,7 @@ define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinl
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
define internal double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
define double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
%va = shufflevector <16 x double> %0, <16 x double> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%vb = shufflevector <16 x double> %0, <16 x double> undef,
|
||||
@@ -316,16 +287,18 @@ define internal double @__reduce_add_double(<16 x double>) nounwind readonly alw
|
||||
|
||||
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd)
|
||||
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
|
||||
%sum = extractelement <4 x double> %sum1, i32 0
|
||||
%final0 = extractelement <4 x double> %sum1, i32 0
|
||||
%final1 = extractelement <4 x double> %sum1, i32 2
|
||||
%sum = fadd double %final0, %final1
|
||||
ret double %sum
|
||||
}
|
||||
|
||||
define internal double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
|
||||
define double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
|
||||
reduce16(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
|
||||
define internal double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
|
||||
define double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
|
||||
reduce16(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
@@ -333,28 +306,28 @@ define internal double @__reduce_max_double(<16 x double>) nounwind readnone alw
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int64 ops
|
||||
|
||||
define internal <16 x i64> @__add_varying_int64(<16 x i64>,
|
||||
define <16 x i64> @__add_varying_int64(<16 x i64>,
|
||||
<16 x i64>) nounwind readnone alwaysinline {
|
||||
%s = add <16 x i64> %0, %1
|
||||
ret <16 x i64> %s
|
||||
}
|
||||
|
||||
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
%s = add i64 %0, %1
|
||||
ret i64 %s
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
define internal i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
@@ -362,17 +335,17 @@ define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinli
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint64 ops
|
||||
|
||||
define internal i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
|
||||
define i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
|
||||
%r = call i64 @__reduce_add_int64(<16 x i64> %v)
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
|
||||
define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
@@ -380,19 +353,14 @@ define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinl
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(16, i8, 8)
|
||||
load_and_broadcast(16, i16, 16)
|
||||
load_and_broadcast(16, i32, 32)
|
||||
load_and_broadcast(16, i64, 64)
|
||||
|
||||
; no masked load instruction for i8 and i16 types??
|
||||
load_masked(16, i8, 8, 1)
|
||||
load_masked(16, i16, 16, 2)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
|
||||
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
|
||||
|
||||
define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
define <16 x i32> @__masked_load_i32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %mask to <16 x float>
|
||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
@@ -410,7 +378,7 @@ define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin
|
||||
}
|
||||
|
||||
|
||||
define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
; double up masks, bitcast to doubles
|
||||
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
@@ -444,6 +412,7 @@ define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
|
||||
ret <16 x i64> %val
|
||||
}
|
||||
|
||||
masked_load_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
@@ -451,15 +420,15 @@ define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
|
||||
; FIXME: there is no AVX instruction for these, but we could be clever
|
||||
; by packing the bits down and setting the last 3/4 or half, respectively,
|
||||
; of the mask to zero... Not sure if this would be a win in the end
|
||||
gen_masked_store(16, i8, 8)
|
||||
gen_masked_store(16, i16, 16)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
|
||||
; note that mask is the 2nd parameter, not the 3rd one!!
|
||||
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
|
||||
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
|
||||
|
||||
define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
%ptr = bitcast <16 x i32> * %0 to i8 *
|
||||
%val = bitcast <16 x i32> %1 to <16 x float>
|
||||
%mask = bitcast <16 x i32> %2 to <16 x float>
|
||||
@@ -481,8 +450,8 @@ define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
|
||||
<16 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
|
||||
<16 x i32> %mask) nounwind alwaysinline {
|
||||
%ptr = bitcast <16 x i64> * %0 to i8 *
|
||||
%val = bitcast <16 x i64> %1 to <16 x double>
|
||||
|
||||
@@ -520,58 +489,125 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
|
||||
ret void
|
||||
}
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;; FIXME: various code elsewhere in the builtins implementations makes
|
||||
;; calls to the 32/64 bit versions of these, basically assuming that doing
|
||||
;; so is faster than doing a full call to an actual masked store, which
|
||||
;; isn't likely to be the case on AVX. So here we provide those functions
|
||||
;; but then don't actually do what the caller asked for...
|
||||
masked_store_blend_8_16_by_16()
|
||||
|
||||
declare void @llvm.trap()
|
||||
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
|
||||
<8 x float>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
call void @llvm.trap()
|
||||
define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
%maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
|
||||
%oldValue = load <16 x i32>* %0, align 4
|
||||
%oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
|
||||
%newAsFloat = bitcast <16 x i32> %1 to <16 x float>
|
||||
|
||||
%old0 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%old1 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%new0 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%new1 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%mask0 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%mask1 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
|
||||
%blend0 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old0,
|
||||
<8 x float> %new0,
|
||||
<8 x float> %mask0)
|
||||
%blend1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old1,
|
||||
<8 x float> %new1,
|
||||
<8 x float> %mask1)
|
||||
%blend = shufflevector <8 x float> %blend0, <8 x float> %blend1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%blendAsInt = bitcast <16 x float> %blend to <16 x i32>
|
||||
store <16 x i32> %blendAsInt, <16 x i32>* %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
call void @llvm.trap()
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
call void @__masked_store_32(<16 x i32> * %0, <16 x i32> %1, <16 x i32> %2)
|
||||
ret void
|
||||
}
|
||||
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
|
||||
<4 x double>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_64(<16 x i64>* nocapture, <16 x i64>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
call void @__masked_store_64(<16 x i64> * %0, <16 x i64> %1, <16 x i32> %2)
|
||||
define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
|
||||
<16 x i32> %mask) nounwind alwaysinline {
|
||||
%oldValue = load <16 x i64>* %ptr, align 8
|
||||
%old = bitcast <16 x i64> %oldValue to <16 x double>
|
||||
%old0d = shufflevector <16 x double> %old, <16 x double> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%old1d = shufflevector <16 x double> %old, <16 x double> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%old2d = shufflevector <16 x double> %old, <16 x double> undef,
|
||||
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||
%old3d = shufflevector <16 x double> %old, <16 x double> undef,
|
||||
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||
|
||||
%new = bitcast <16 x i64> %newi64 to <16 x double>
|
||||
%new0d = shufflevector <16 x double> %new, <16 x double> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%new1d = shufflevector <16 x double> %new, <16 x double> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%new2d = shufflevector <16 x double> %new, <16 x double> undef,
|
||||
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||
%new3d = shufflevector <16 x double> %new, <16 x double> undef,
|
||||
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||
|
||||
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
%mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
|
||||
%mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
|
||||
%mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
|
||||
%mask0d = bitcast <8 x i32> %mask0 to <4 x double>
|
||||
%mask1d = bitcast <8 x i32> %mask1 to <4 x double>
|
||||
%mask2d = bitcast <8 x i32> %mask2 to <4 x double>
|
||||
%mask3d = bitcast <8 x i32> %mask3 to <4 x double>
|
||||
|
||||
%result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
|
||||
<4 x double> %new0d, <4 x double> %mask0d)
|
||||
%result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
|
||||
<4 x double> %new1d, <4 x double> %mask1d)
|
||||
%result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
|
||||
<4 x double> %new2d, <4 x double> %mask2d)
|
||||
%result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
|
||||
<4 x double> %new3d, <4 x double> %mask3d)
|
||||
|
||||
%result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
|
||||
%result = shufflevector <8 x double> %result01, <8 x double> %result23,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%result64 = bitcast <16 x double> %result to <16 x i64>
|
||||
store <16 x i64> %result64, <16 x i64> * %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
;; scatter
|
||||
|
||||
gen_gather(16, i8)
|
||||
gen_gather(16, i16)
|
||||
gen_gather(16, i32)
|
||||
gen_gather(16, i64)
|
||||
|
||||
gen_scatter(16, i8)
|
||||
gen_scatter(16, i16)
|
||||
gen_scatter(16, i32)
|
||||
gen_scatter(16, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
||||
|
||||
define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
|
||||
define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
|
||||
unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
||||
ret <16 x double> %ret
|
||||
}
|
||||
@@ -583,12 +619,12 @@ define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alw
|
||||
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
define internal <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||
define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||
binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
||||
ret <16 x double> %ret
|
||||
}
|
||||
|
||||
define internal <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||
define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||
binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
||||
ret <16 x double> %ret
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -29,29 +29,26 @@
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; *** Untested *** AVX target implementation.
|
||||
;;
|
||||
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||
;; chance that there are bugs in the code in this file.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Basic 8-wide definitions
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
define(`WIDTH',`8')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
include(`builtins-avx-common.ll')
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
|
||||
include(`target-avx-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
|
||||
@@ -69,19 +66,19 @@ define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 8)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
|
||||
ret <8 x float> %call
|
||||
@@ -92,17 +89,17 @@ define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
round4to8double(%0, 8)
|
||||
}
|
||||
|
||||
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round4to8double(%0, 9)
|
||||
}
|
||||
|
||||
|
||||
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round4to8double(%0, 10)
|
||||
}
|
||||
@@ -113,7 +110,7 @@ define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind reado
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
%is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
@@ -132,7 +129,7 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
@@ -160,55 +157,29 @@ declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
|
||||
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__max_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__max_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal <8 x float> @__min_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__min_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
;; 8-wide int32 minimum: hands both unnamed arguments (%0, %1) and the
;; SSE4.1 pminsd intrinsic to the m4 helper below, then returns the %ret
;; value that the helper defines.
;; NOTE(review): the helper is defined outside this view; presumably it
;; applies the 4-wide intrinsic to each half of the 8-wide inputs -- confirm.
define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
;; 8-wide int32 maximum: hands both unnamed arguments (%0, %1) and the
;; SSE4.1 pmaxsd intrinsic to the m4 helper below, then returns the %ret
;; value that the helper defines.
;; NOTE(review): the helper is defined outside this view; presumably it
;; applies the 4-wide intrinsic to each half of the 8-wide inputs -- confirm.
define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
;; 8-wide uint32 minimum: same shape as the int32 variant but passes the
;; pminud intrinsic to the m4 helper, then returns the %ret value that the
;; helper defines.
;; NOTE(review): the helper is defined outside this view; presumably it
;; applies the 4-wide intrinsic to each half of the 8-wide inputs -- confirm.
define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
;; 8-wide uint32 maximum: same shape as the int32 variant but passes the
;; pmaxud intrinsic to the m4 helper, then returns the %ret value that the
;; helper defines.
;; NOTE(review): the helper is defined outside this view; presumably it
;; applies the 4-wide intrinsic to each half of the 8-wide inputs -- confirm.
define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@@ -216,7 +187,7 @@ define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
|
||||
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
|
||||
%scalar1 = extractelement <8 x float> %v2, i32 0
|
||||
@@ -226,12 +197,12 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
|
||||
}
|
||||
|
||||
|
||||
define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
|
||||
define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
@@ -240,28 +211,28 @@ reduce_equal(8)
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int32 ops
|
||||
|
||||
define internal <8 x i32> @__add_varying_int32(<8 x i32>,
|
||||
<8 x i32>) nounwind readnone alwaysinline {
|
||||
define <8 x i32> @__add_varying_int32(<8 x i32>,
|
||||
<8 x i32>) nounwind readnone alwaysinline {
|
||||
%s = add <8 x i32> %0, %1
|
||||
ret <8 x i32> %s
|
||||
}
|
||||
|
||||
define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
%s = add i32 %0, %1
|
||||
ret i32 %s
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
@@ -269,17 +240,17 @@ define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinlin
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint32 ops
|
||||
|
||||
define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
@@ -289,24 +260,26 @@ define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinli
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
%v0 = shufflevector <8 x double> %0, <8 x double> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v1 = shufflevector <8 x double> %0, <8 x double> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%sum01 = fadd <4 x double> %v0, %v1
|
||||
%red0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum01, <4 x double> %sum01)
|
||||
%red1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %red0, <4 x double> %red0)
|
||||
%sum = extractelement <4 x double> %red1, i32 0
|
||||
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
|
||||
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
|
||||
%final0 = extractelement <4 x double> %sum1, i32 0
|
||||
%final1 = extractelement <4 x double> %sum1, i32 2
|
||||
%sum = fadd double %final0, %final1
|
||||
|
||||
ret double %sum
|
||||
}
|
||||
|
||||
define internal double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
|
||||
define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
|
||||
reduce8(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
|
||||
define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
|
||||
define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
|
||||
reduce8(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
@@ -314,28 +287,28 @@ define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwa
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int64 ops
|
||||
|
||||
define internal <8 x i64> @__add_varying_int64(<8 x i64>,
|
||||
<8 x i64>) nounwind readnone alwaysinline {
|
||||
define <8 x i64> @__add_varying_int64(<8 x i64>,
|
||||
<8 x i64>) nounwind readnone alwaysinline {
|
||||
%s = add <8 x i64> %0, %1
|
||||
ret <8 x i64> %s
|
||||
}
|
||||
|
||||
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
%s = add i64 %0, %1
|
||||
ret i64 %s
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
@@ -343,17 +316,17 @@ define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinlin
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint64 ops
|
||||
|
||||
define internal i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
|
||||
define i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
|
||||
%r = call i64 @__reduce_add_int64(<8 x i64> %v)
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
|
||||
define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
@@ -361,19 +334,15 @@ define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinli
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(8, i8, 8)
|
||||
load_and_broadcast(8, i16, 16)
|
||||
load_and_broadcast(8, i32, 32)
|
||||
load_and_broadcast(8, i64, 64)
|
||||
|
||||
; no masked load instruction for i8 and i16 types??
|
||||
load_masked(8, i8, 8, 1)
|
||||
load_masked(8, i16, 16, 2)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
|
||||
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
|
||||
|
||||
define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
define <8 x i32> @__masked_load_i32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %mask to <8 x float>
|
||||
%floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
|
||||
%retval = bitcast <8 x float> %floatval to <8 x i32>
|
||||
@@ -381,7 +350,7 @@ define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline
|
||||
}
|
||||
|
||||
|
||||
define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
; double up masks, bitcast to doubles
|
||||
%mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
@@ -400,22 +369,20 @@ define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
|
||||
ret <8 x i64> %val
|
||||
}
|
||||
|
||||
masked_load_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
; FIXME: there is no AVX instruction for these, but we could be clever
|
||||
; by packing the bits down and setting the last 3/4 or half, respectively,
|
||||
; of the mask to zero... Not sure if this would be a win in the end
|
||||
gen_masked_store(8, i8, 8)
|
||||
gen_masked_store(8, i16, 16)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
|
||||
; note that mask is the 2nd parameter, not the 3rd one!!
|
||||
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
|
||||
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
|
||||
|
||||
define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
define void @__masked_store_i32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
%ptr = bitcast <8 x i32> * %0 to i8 *
|
||||
%val = bitcast <8 x i32> %1 to <8 x float>
|
||||
%mask = bitcast <8 x i32> %2 to <8 x float>
|
||||
@@ -423,8 +390,8 @@ define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
%ptr = bitcast <8 x i64> * %0 to i8 *
|
||||
%val = bitcast <8 x i64> %1 to <8 x double>
|
||||
|
||||
@@ -448,61 +415,94 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
||||
}
|
||||
|
||||
|
||||
;; FIXME: various code elsewhere in the builtins implementations makes
|
||||
;; calls to the 32/64 bit versions of these, basically assuming that doing
|
||||
;; so is faster than doing a full call to an actual masked store, which
|
||||
;; isn't likely to be the case on AVX. So here we provide those functions
|
||||
;; but then don't actually do what the caller asked for...
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
declare void @llvm.trap()
|
||||
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
|
||||
<8 x float>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
call void @llvm.trap()
|
||||
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
|
||||
%oldValue = load <8 x i32>* %0, align 4
|
||||
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
|
||||
%newAsFloat = bitcast <8 x i32> %1 to <8 x float>
|
||||
%blend = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat,
|
||||
<8 x float> %newAsFloat,
|
||||
<8 x float> %mask_as_float)
|
||||
%blendAsInt = bitcast <8 x float> %blend to <8 x i32>
|
||||
store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
;; Stub for the 16-bit blend-style masked store: none of the three
;; arguments is used; the body only calls @llvm.trap() and returns, so
;; nothing is ever written through the pointer.
define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
call void @llvm.trap()
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
;; 32-bit blend-style masked store: does no load/blend/store of its own and
;; simply forwards pointer (%0), value (%1) and mask (%2) unchanged to
;; @__masked_store_32.
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
call void @__masked_store_32(<8 x i32> * %0, <8 x i32> %1, <8 x i32> %2)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture, <8 x i64>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
call void @__masked_store_64(<8 x i64> * %0, <8 x i64> %1, <8 x i32> %2)
|
||||
;; Masked store of eight 64-bit values implemented as load / blend / store.
;;   %ptr     - destination vector; it is loaded in full and stored in full
;;              unconditionally, whatever the mask holds
;;   %new     - values to write into the selected lanes
;;   %i32mask - one 32-bit mask lane per 64-bit element
;; The old and new vectors are split into two 4 x i64 halves, each half is
;; blended with one blendv.ps call, and the halves are rejoined and stored.
;; NOTE(review): blendv presumably selects on the sign bit of each mask
;; lane -- confirm callers pass all-ones / all-zeros lanes.
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %i32mask) nounwind alwaysinline {
|
||||
%oldValue = load <8 x i64>* %ptr, align 8
|
||||
%mask = bitcast <8 x i32> %i32mask to <8 x float>
|
||||
|
||||
; Do the eight 64-bit blends as two <8 x float> blends, where each
|
||||
; <8 x float> value is actually a bitcast <4 x i64> value
|
||||
;
|
||||
; set up the first four 64-bit values
|
||||
%old01 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%old01f = bitcast <4 x i64> %old01 to <8 x float>
|
||||
%new01 = shufflevector <8 x i64> %new, <8 x i64> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%new01f = bitcast <4 x i64> %new01 to <8 x float>
|
||||
; compute mask--each i64 spans two float lanes after the bitcast, so every
; mask lane index (0..3) is repeated twice
|
||||
%mask01 = shufflevector <8 x float> %mask, <8 x float> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1,
|
||||
i32 2, i32 2, i32 3, i32 3>
|
||||
; and blend them
|
||||
%result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
|
||||
<8 x float> %new01f,
|
||||
<8 x float> %mask01)
|
||||
%result01 = bitcast <8 x float> %result01f to <4 x i64>
|
||||
|
||||
; and again, for the last four 64-bit values (elements 4..7)
|
||||
%old23 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%old23f = bitcast <4 x i64> %old23 to <8 x float>
|
||||
%new23 = shufflevector <8 x i64> %new, <8 x i64> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%new23f = bitcast <4 x i64> %new23 to <8 x float>
|
||||
; compute mask--mask lanes 4..7, each repeated twice as above
|
||||
%mask23 = shufflevector <8 x float> %mask, <8 x float> undef,
|
||||
<8 x i32> <i32 4, i32 4, i32 5, i32 5,
|
||||
i32 6, i32 6, i32 7, i32 7>
|
||||
; and blend them
|
||||
%result23f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f,
|
||||
<8 x float> %new23f,
|
||||
<8 x float> %mask23)
|
||||
%result23 = bitcast <8 x float> %result23f to <4 x i64>
|
||||
|
||||
; reconstruct the final <8 x i64> vector
|
||||
%final = shufflevector <4 x i64> %result01, <4 x i64> %result23,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 4, i32 5, i32 6, i32 7>
|
||||
store <8 x i64> %final, <8 x i64> * %ptr, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
;; scatter
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
|
||||
gen_scatter(8, i8)
|
||||
gen_scatter(8, i16)
|
||||
gen_scatter(8, i32)
|
||||
gen_scatter(8, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
@@ -514,12 +514,12 @@ define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alway
|
||||
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
79
builtins/target-avx1-x2.ll
Normal file
79
builtins/target-avx1-x2.ll
Normal file
@@ -0,0 +1,79 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx-x2.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
;; 16-wide int32 minimum: hands both unnamed arguments (%0, %1) and the
;; SSE4.1 pminsd intrinsic to the m4 helper below, then returns the %ret
;; value that the helper defines.
;; NOTE(review): the helper is defined outside this view; presumably it
;; applies the 4-wide intrinsic to each quarter of the 16-wide inputs --
;; confirm.
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;; 16-wide int32 maximum: hands both unnamed arguments (%0, %1) and the
;; SSE4.1 pmaxsd intrinsic to the m4 helper below, then returns the %ret
;; value that the helper defines.
;; NOTE(review): the helper is defined outside this view; presumably it
;; applies the 4-wide intrinsic to each quarter of the 16-wide inputs --
;; confirm.
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
;; 16-wide uint32 minimum: same shape as the int32 variant but passes the
;; pminud intrinsic to the m4 helper, then returns the %ret value that the
;; helper defines.
;; NOTE(review): the helper is defined outside this view; presumably it
;; applies the 4-wide intrinsic to each quarter of the 16-wide inputs --
;; confirm.
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;; 16-wide uint32 maximum: same shape as the int32 variant but passes the
;; pmaxud intrinsic to the m4 helper, then returns the %ret value that the
;; helper defines.
;; NOTE(review): the helper is defined outside this view; presumably it
;; applies the 4-wide intrinsic to each quarter of the 16-wide inputs --
;; confirm.
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
ifelse(NO_HALF_DECLARES, `1', `', `
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
gen_gather(i32)
|
||||
gen_gather(float)
|
||||
gen_gather(i64)
|
||||
gen_gather(double)
|
||||
79
builtins/target-avx1.ll
Normal file
79
builtins/target-avx1.ll
Normal file
@@ -0,0 +1,79 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
;; Signed 32-bit minimum, taken lane by lane over two 8-wide vectors.
;; The macro below receives the 4-wide SSE4.1 pminsd intrinsic and both
;; operands and leaves its result in %r; the macro body is not visible here.
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(r, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <8 x i32> %r
}
|
||||
|
||||
;; Signed 32-bit maximum, taken lane by lane over two 8-wide vectors.
;; The macro below receives the 4-wide SSE4.1 pmaxsd intrinsic and both
;; operands and leaves its result in %r; the macro body is not visible here.
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(r, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <8 x i32> %r
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
;; Unsigned 32-bit minimum, taken lane by lane over two 8-wide vectors.
;; The macro below receives the 4-wide SSE4.1 pminud intrinsic and both
;; operands and leaves its result in %r; the macro body is not visible here.
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(r, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <8 x i32> %r
}
|
||||
|
||||
;; Unsigned 32-bit maximum, taken lane by lane over two 8-wide vectors.
;; The macro below receives the 4-wide SSE4.1 pmaxud intrinsic and both
;; operands and leaves its result in %r; the macro body is not visible here.
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(r, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <8 x i32> %r
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
; Forward declarations for the half-precision conversion routines.  They are
; emitted only when NO_HALF_DECLARES does not expand to 1, so a file that
; supplies its own definitions can switch them off before pulling this one in.
ifelse(NO_HALF_DECLARES, `1', `', `
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
; One macro expansion per element type.  The macro lives in an included file
; that is not shown here; presumably each expansion emits the gather entry
; points for that type -- TODO confirm.
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
gen_gather(i32)
|
||||
gen_gather(float)
|
||||
gen_gather(i64)
|
||||
gen_gather(double)
|
||||
89
builtins/target-avx11-x2.ll
Normal file
89
builtins/target-avx11-x2.ll
Normal file
@@ -0,0 +1,89 @@
|
||||
;; Copyright (c) 2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
define(`NO_HALF_DECLARES', `1')
|
||||
|
||||
include(`target-avx1-x2.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
|
||||
; 0 is round nearest even
|
||||
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
;; Widen sixteen half-precision values, passed as i16 bit patterns, to
;; single precision.  The conversion intrinsic used here is 8-wide, so the
;; input is cut into lanes 0-7 and lanes 8-15, each piece is converted, and
;; the two 8-wide results are merged back into one 16-wide vector.
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
  ; cut the input into its low and high eight lanes
  %lo_half = shufflevector <16 x i16> %v, <16 x i16> undef,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi_half = shufflevector <16 x i16> %v, <16 x i16> undef,
      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; widen each piece on its own
  %lo_float = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %lo_half)
  %hi_float = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %hi_half)
  ; indices 0-7 pick from %lo_float and 8-15 pick from %hi_float
  %merged = shufflevector <8 x float> %lo_float, <8 x float> %hi_float,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %merged
}
|
||||
|
||||
;; Narrow sixteen single-precision values to half precision, returned as
;; i16 bit patterns.  The conversion intrinsic used here is 8-wide, so the
;; input is cut into lanes 0-7 and lanes 8-15, each piece is converted with
;; rounding immediate 0, which is round to nearest even, and the two 8-wide
;; results are merged back into one 16-wide vector.
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
  ; cut the input into its low and high eight lanes
  %lo_float = shufflevector <16 x float> %v, <16 x float> undef,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi_float = shufflevector <16 x float> %v, <16 x float> undef,
      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; narrow each piece on its own
  %lo_half = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %lo_float, i32 0)
  %hi_half = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %hi_float, i32 0)
  ; indices 0-7 pick from %lo_half and 8-15 pick from %hi_half
  %merged = shufflevector <8 x i16> %lo_half, <8 x i16> %hi_half,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %merged
}
|
||||
|
||||
;; Widen one half-precision value, passed as an i16 bit pattern, to single
;; precision.  Only an 8-wide conversion intrinsic is used, so the scalar is
;; placed in lane 0 of an otherwise undefined vector, the whole vector is
;; converted, and lane 0 of the result is returned.
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %as_vec = bitcast i16 %v to <1 x i16>
  ; only lane 0 carries data; the remaining seven lanes stay undefined
  %padded = shufflevector <1 x i16> %as_vec, <1 x i16> undef,
      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  %wide_f = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %padded)
  %lane0 = extractelement <8 x float> %wide_f, i32 0
  ret float %lane0
}
|
||||
|
||||
;; Narrow one single-precision value to half precision, returned as an i16
;; bit pattern.  Only an 8-wide conversion intrinsic is used, so the scalar
;; is placed in lane 0 of an otherwise undefined vector, the whole vector is
;; converted, and lane 0 of the result is returned.
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %as_vec = bitcast float %v to <1 x float>
  ; only lane 0 carries data; the remaining seven lanes stay undefined
  %padded = shufflevector <1 x float> %as_vec, <1 x float> undef,
      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  ; immediate 0 asks for round to nearest even
  %wide_h = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %padded, i32 0)
  %lane0 = extractelement <8 x i16> %wide_h, i32 0
  ret i16 %lane0
}
|
||||
|
||||
72
builtins/target-avx11.ll
Normal file
72
builtins/target-avx11.ll
Normal file
@@ -0,0 +1,72 @@
|
||||
;; Copyright (c) 2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
define(`NO_HALF_DECLARES', `1')
|
||||
|
||||
include(`target-avx1.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
|
||||
; 0 is round nearest even
|
||||
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
;; Widen eight half-precision values, passed as i16 bit patterns, to single
;; precision with one call to the 8-wide conversion intrinsic.
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
  %wide_f = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
  ret <8 x float> %wide_f
}
|
||||
|
||||
;; Narrow eight single-precision values to half precision, returned as i16
;; bit patterns, with one call to the 8-wide conversion intrinsic.
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
  ; immediate 0 asks for round to nearest even
  %wide_h = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
  ret <8 x i16> %wide_h
}
|
||||
|
||||
;; Widen one half-precision value, passed as an i16 bit pattern, to single
;; precision.  Only an 8-wide conversion intrinsic is used, so the scalar is
;; placed in lane 0 of an otherwise undefined vector, the whole vector is
;; converted, and lane 0 of the result is returned.
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %as_vec = bitcast i16 %v to <1 x i16>
  ; only lane 0 carries data; the remaining seven lanes stay undefined
  %padded = shufflevector <1 x i16> %as_vec, <1 x i16> undef,
      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  %wide_f = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %padded)
  %lane0 = extractelement <8 x float> %wide_f, i32 0
  ret float %lane0
}
|
||||
|
||||
;; Narrow one single-precision value to half precision, returned as an i16
;; bit pattern.  Only an 8-wide conversion intrinsic is used, so the scalar
;; is placed in lane 0 of an otherwise undefined vector, the whole vector is
;; converted, and lane 0 of the result is returned.
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %as_vec = bitcast float %v to <1 x float>
  ; only lane 0 carries data; the remaining seven lanes stay undefined
  %padded = shufflevector <1 x float> %as_vec, <1 x float> undef,
      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  ; immediate 0 asks for round to nearest even
  %wide_h = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %padded, i32 0)
  %lane0 = extractelement <8 x i16> %wide_h, i32 0
  ret i16 %lane0
}
|
||||
129
builtins/target-avx2-x2.ll
Normal file
129
builtins/target-avx2-x2.ll
Normal file
@@ -0,0 +1,129 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx-x2.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
|
||||
;; Signed 32-bit minimum, taken lane by lane over two 16-wide vectors.
;; The macro below receives the 8-wide AVX2 signed-minimum intrinsic and
;; both operands and leaves its result in %r; the macro body is not visible
;; here.
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(r, i32, @llvm.x86.avx2.pmins.d, %0, %1)
  ret <16 x i32> %r
}
|
||||
|
||||
;; Signed 32-bit maximum, taken lane by lane over two 16-wide vectors.
;; The macro below receives the 8-wide AVX2 signed-maximum intrinsic and
;; both operands and leaves its result in %r; the macro body is not visible
;; here.
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(r, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
  ret <16 x i32> %r
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
|
||||
;; Unsigned 32-bit minimum, taken lane by lane over two 16-wide vectors.
;; The macro below receives the 8-wide AVX2 unsigned-minimum intrinsic and
;; both operands and leaves its result in %r; the macro body is not visible
;; here.
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(r, i32, @llvm.x86.avx2.pminu.d, %0, %1)
  ret <16 x i32> %r
}
|
||||
|
||||
;; Unsigned 32-bit maximum, taken lane by lane over two 16-wide vectors.
;; The macro below receives the 8-wide AVX2 unsigned-maximum intrinsic and
;; both operands and leaves its result in %r; the macro body is not visible
;; here.
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(r, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
  ret <16 x i32> %r
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
|
||||
; 0 is round nearest even
|
||||
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
;; Widen sixteen half-precision values, passed as i16 bit patterns, to
;; single precision.  The conversion intrinsic used here is 8-wide, so the
;; input is cut into lanes 0-7 and lanes 8-15, each piece is converted, and
;; the two 8-wide results are merged back into one 16-wide vector.
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
  ; cut the input into its low and high eight lanes
  %lo_half = shufflevector <16 x i16> %v, <16 x i16> undef,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi_half = shufflevector <16 x i16> %v, <16 x i16> undef,
      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; widen each piece on its own
  %lo_float = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %lo_half)
  %hi_float = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %hi_half)
  ; indices 0-7 pick from %lo_float and 8-15 pick from %hi_float
  %merged = shufflevector <8 x float> %lo_float, <8 x float> %hi_float,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %merged
}
|
||||
|
||||
;; Narrow sixteen single-precision values to half precision, returned as
;; i16 bit patterns.  The conversion intrinsic used here is 8-wide, so the
;; input is cut into lanes 0-7 and lanes 8-15, each piece is converted with
;; rounding immediate 0, which is round to nearest even, and the two 8-wide
;; results are merged back into one 16-wide vector.
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
  ; cut the input into its low and high eight lanes
  %lo_float = shufflevector <16 x float> %v, <16 x float> undef,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi_float = shufflevector <16 x float> %v, <16 x float> undef,
      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; narrow each piece on its own
  %lo_half = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %lo_float, i32 0)
  %hi_half = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %hi_float, i32 0)
  ; indices 0-7 pick from %lo_half and 8-15 pick from %hi_half
  %merged = shufflevector <8 x i16> %lo_half, <8 x i16> %hi_half,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %merged
}
|
||||
|
||||
;; Widen one half-precision value, passed as an i16 bit pattern, to single
;; precision.  Only an 8-wide conversion intrinsic is used, so the scalar is
;; placed in lane 0 of an otherwise undefined vector, the whole vector is
;; converted, and lane 0 of the result is returned.
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %as_vec = bitcast i16 %v to <1 x i16>
  ; only lane 0 carries data; the remaining seven lanes stay undefined
  %padded = shufflevector <1 x i16> %as_vec, <1 x i16> undef,
      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  %wide_f = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %padded)
  %lane0 = extractelement <8 x float> %wide_f, i32 0
  ret float %lane0
}
|
||||
|
||||
;; Narrow one single-precision value to half precision, returned as an i16
;; bit pattern.  Only an 8-wide conversion intrinsic is used, so the scalar
;; is placed in lane 0 of an otherwise undefined vector, the whole vector is
;; converted, and lane 0 of the result is returned.
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %as_vec = bitcast float %v to <1 x float>
  ; only lane 0 carries data; the remaining seven lanes stay undefined
  %padded = shufflevector <1 x float> %as_vec, <1 x float> undef,
      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  ; immediate 0 asks for round to nearest even
  %wide_h = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %padded, i32 0)
  %lane0 = extractelement <8 x i16> %wide_h, i32 0
  ret i16 %lane0
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
; One macro expansion per element type.  The macro lives in an included file
; that is not shown here; presumably each expansion emits the gather entry
; points for that type -- TODO confirm.
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
gen_gather(i32)
|
||||
gen_gather(float)
|
||||
gen_gather(i64)
|
||||
gen_gather(double)
|
||||
112
builtins/target-avx2.ll
Normal file
112
builtins/target-avx2.ll
Normal file
@@ -0,0 +1,112 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
|
||||
;; Signed 32-bit minimum, taken lane by lane over two 8-wide vectors, with a
;; single call to the 8-wide AVX2 intrinsic.
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %res
}
|
||||
|
||||
;; Signed 32-bit maximum, taken lane by lane over two 8-wide vectors, with a
;; single call to the 8-wide AVX2 intrinsic.
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %res
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
|
||||
;; Unsigned 32-bit minimum, taken lane by lane over two 8-wide vectors, with
;; a single call to the 8-wide AVX2 intrinsic.
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %res
}
|
||||
|
||||
;; Unsigned 32-bit maximum, taken lane by lane over two 8-wide vectors, with
;; a single call to the 8-wide AVX2 intrinsic.
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %res
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
|
||||
; 0 is round nearest even
|
||||
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
;; Widen eight half-precision values, passed as i16 bit patterns, to single
;; precision with one call to the 8-wide conversion intrinsic.
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
  %wide_f = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
  ret <8 x float> %wide_f
}
|
||||
|
||||
;; Narrow eight single-precision values to half precision, returned as i16
;; bit patterns, with one call to the 8-wide conversion intrinsic.
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
  ; immediate 0 asks for round to nearest even
  %wide_h = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
  ret <8 x i16> %wide_h
}
|
||||
|
||||
;; Widen one half-precision value, passed as an i16 bit pattern, to single
;; precision.  Only an 8-wide conversion intrinsic is used, so the scalar is
;; placed in lane 0 of an otherwise undefined vector, the whole vector is
;; converted, and lane 0 of the result is returned.
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %as_vec = bitcast i16 %v to <1 x i16>
  ; only lane 0 carries data; the remaining seven lanes stay undefined
  %padded = shufflevector <1 x i16> %as_vec, <1 x i16> undef,
      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  %wide_f = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %padded)
  %lane0 = extractelement <8 x float> %wide_f, i32 0
  ret float %lane0
}
|
||||
|
||||
;; Narrow one single-precision value to half precision, returned as an i16
;; bit pattern.  Only an 8-wide conversion intrinsic is used, so the scalar
;; is placed in lane 0 of an otherwise undefined vector, the whole vector is
;; converted, and lane 0 of the result is returned.
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %as_vec = bitcast float %v to <1 x float>
  ; only lane 0 carries data; the remaining seven lanes stay undefined
  %padded = shufflevector <1 x float> %as_vec, <1 x float> undef,
      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  ; immediate 0 asks for round to nearest even
  %wide_h = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %padded, i32 0)
  %lane0 = extractelement <8 x i16> %wide_h, i32 0
  ret i16 %lane0
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
; One macro expansion per element type.  The macro lives in an included file
; that is not shown here; presumably each expansion emits the gather entry
; points for that type -- TODO confirm.
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
gen_gather(i32)
|
||||
gen_gather(float)
|
||||
gen_gather(i64)
|
||||
gen_gather(double)
|
||||
939
builtins/target-generic-1.ll
Executable file
939
builtins/target-generic-1.ll
Executable file
@@ -0,0 +1,939 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Define the standard library builtins for the NOVEC target
|
||||
; Target parameters: the mask element type is i32 and the vector width is 1.
; Both symbols are set before the shared macro file is pulled in.
define(`MASK',`i32')
|
||||
define(`WIDTH',`1')
|
||||
include(`util.m4')
|
||||
; Define some basics for a 1-wide target
|
||||
; Each call below expands a macro whose body is not visible in this file;
; presumably the macros come from the file included above -- TODO confirm.
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
aossoa()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
; One macro expansion per integer element type.  The macro body is not
; visible here; presumably each expansion emits the masked store routine
; for that type -- TODO confirm.
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
|
||||
; One macro expansion per element type.  The second argument equals the size
; of the element in bytes in every call; what the macro does with it is not
; visible here -- TODO confirm against the macro definition.
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from util.m4
|
||||
|
||||
; Gather side: one macro expansion per element type.  The macro body is not
; visible here; presumably each expansion emits the gather entry points for
; that type -- TODO confirm.
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
gen_gather(i32)
|
||||
gen_gather(float)
|
||||
gen_gather(i64)
|
||||
gen_gather(double)
|
||||
|
||||
; Scatter side: the same list of element types, one expansion each.
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
|
||||
;; Mask-driven select for the 1-wide target: the result is the first operand
;; when the mask lane is zero and the second operand otherwise.
;; A vector compare and select is deliberately not used because of problems
;; with the LLVM scalarizer, so the single lane is pulled out, chosen as a
;; scalar, and wrapped back into a one-element vector.
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
                              <1 x i32> %mask) nounwind readnone alwaysinline {
  %old_lane = extractelement <1 x i8> %0, i32 0
  %new_lane = extractelement <1 x i8> %1, i32 0
  %mask_lane = extractelement <1 x i32> %mask, i32 0
  %mask_off = icmp eq i32 %mask_lane, 0
  %picked = select i1 %mask_off, i8 %old_lane, i8 %new_lane
  %wrapped = insertelement <1 x i8> undef, i8 %picked, i32 0
  ret <1 x i8> %wrapped
}
|
||||
|
||||
;; Mask-driven select for the 1-wide target: the result is the first operand
;; when the mask lane is zero and the second operand otherwise.
;; The choice is made on extracted scalars rather than with a vector compare
;; and select, and the chosen value is wrapped back into a one-element
;; vector.
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
                                <1 x i32> %mask) nounwind readnone alwaysinline {
  %old_lane = extractelement <1 x i16> %0, i32 0
  %new_lane = extractelement <1 x i16> %1, i32 0
  %mask_lane = extractelement <1 x i32> %mask, i32 0
  %mask_off = icmp eq i32 %mask_lane, 0
  %picked = select i1 %mask_off, i16 %old_lane, i16 %new_lane
  %wrapped = insertelement <1 x i16> undef, i16 %picked, i32 0
  ret <1 x i16> %wrapped
}
|
||||
|
||||
|
||||
;; Mask-driven select for the 1-wide target: the result is the first operand
;; when the mask lane is zero and the second operand otherwise.
;; The choice is made on extracted scalars rather than with a vector compare
;; and select, and the chosen value is wrapped back into a one-element
;; vector.
define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
                                <1 x i32> %mask) nounwind readnone alwaysinline {
  %old_lane = extractelement <1 x i32> %0, i32 0
  %new_lane = extractelement <1 x i32> %1, i32 0
  %mask_lane = extractelement <1 x i32> %mask, i32 0
  %mask_off = icmp eq i32 %mask_lane, 0
  %picked = select i1 %mask_off, i32 %old_lane, i32 %new_lane
  %wrapped = insertelement <1 x i32> undef, i32 %picked, i32 0
  ret <1 x i32> %wrapped
}
|
||||
|
||||
;; Mask-driven select for the 1-wide target: the result is the first operand
;; when the mask lane is zero and the second operand otherwise.  The mask
;; stays 32 bits wide even though the data is 64 bits wide.
;; The choice is made on extracted scalars rather than with a vector compare
;; and select, and the chosen value is wrapped back into a one-element
;; vector.
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
                                <1 x i32> %mask) nounwind readnone alwaysinline {
  %old_lane = extractelement <1 x i64> %0, i32 0
  %new_lane = extractelement <1 x i64> %1, i32 0
  %mask_lane = extractelement <1 x i32> %mask, i32 0
  %mask_off = icmp eq i32 %mask_lane, 0
  %picked = select i1 %mask_off, i64 %old_lane, i64 %new_lane
  %wrapped = insertelement <1 x i64> undef, i64 %picked, i32 0
  ret <1 x i64> %wrapped
}
|
||||
|
||||
;; Mask-driven select for the 1-wide target: the result is the first operand
;; when the mask lane is zero and the second operand otherwise.
;; The choice is made directly on extracted float scalars, without a detour
;; through the integer variant and without a vector compare and select; the
;; chosen value is wrapped back into a one-element vector.
define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
                                    <1 x i32> %mask) nounwind readnone alwaysinline {
  %old_lane = extractelement <1 x float> %0, i32 0
  %new_lane = extractelement <1 x float> %1, i32 0
  %mask_lane = extractelement <1 x i32> %mask, i32 0
  %mask_off = icmp eq i32 %mask_lane, 0
  %picked = select i1 %mask_off, float %old_lane, float %new_lane
  %wrapped = insertelement <1 x float> undef, float %picked, i32 0
  ret <1 x float> %wrapped
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
;; Masked store for the 1-wide target, done as read, blend, write: the
;; current memory contents are loaded, combined with the new value by
;; __vselect_i8 under %mask, and the combined value is written back.
;;   %0    - destination pointer
;;   %1    - new value
;;   %mask - keeps the old contents when its lane is zero
;; The accesses claim only byte alignment.  LLVM treats an overstated
;; alignment as undefined behavior and a one-byte element has no stronger
;; guarantee, so the earlier align 4 was unsafe.
define void @__masked_store_blend_i8(<1 x i8>* nocapture, <1 x i8>,
                                     <1 x i32> %mask) nounwind alwaysinline {
  %old = load <1 x i8> * %0, align 1
  %blended = call <1 x i8> @__vselect_i8(<1 x i8> %old, <1 x i8> %1, <1 x i32> %mask)
  store <1 x i8> %blended, <1 x i8> * %0, align 1
  ret void
}
|
||||
|
||||
;; Masked store for the 1-wide target, done as read, blend, write: the
;; current memory contents are loaded, combined with the new value by
;; __vselect_i16 under %mask, and the combined value is written back.
;;   %0    - destination pointer
;;   %1    - new value
;;   %mask - keeps the old contents when its lane is zero
;; The accesses claim two-byte alignment, the natural alignment of the
;; element.  LLVM treats an overstated alignment as undefined behavior, so
;; the earlier align 4 was unsafe for a two-byte element.
define void @__masked_store_blend_i16(<1 x i16>* nocapture, <1 x i16>,
                                      <1 x i32> %mask) nounwind alwaysinline {
  %old = load <1 x i16> * %0, align 2
  %blended = call <1 x i16> @__vselect_i16(<1 x i16> %old, <1 x i16> %1, <1 x i32> %mask)
  store <1 x i16> %blended, <1 x i16> * %0, align 2
  ret void
}
|
||||
|
||||
;; Masked store for the 1-wide target, done as read, blend, write: the
;; current memory contents are loaded, combined with the new value by
;; __vselect_i32 under %mask, and the combined value is written back.
;;   %0    - destination pointer
;;   %1    - new value
;;   %mask - keeps the old contents when its lane is zero
define void @__masked_store_blend_i32(<1 x i32>* nocapture, <1 x i32>,
                                      <1 x i32> %mask) nounwind alwaysinline {
  %old = load <1 x i32> * %0, align 4
  %blended = call <1 x i32> @__vselect_i32(<1 x i32> %old, <1 x i32> %1, <1 x i32> %mask)
  store <1 x i32> %blended, <1 x i32> * %0, align 4
  ret void
}
|
||||
|
||||
;; Masked store for the 1-wide target, done as read, blend, write: the
;; current memory contents are loaded, combined with the new value by
;; __vselect_i64 under %mask, and the combined value is written back.
;;   %0    - destination pointer
;;   %1    - new value
;;   %mask - keeps the old contents when its lane is zero
;; The accesses claim only four-byte alignment for the eight-byte element.
define void @__masked_store_blend_i64(<1 x i64>* nocapture, <1 x i64>,
                                      <1 x i32> %mask) nounwind alwaysinline {
  %old = load <1 x i64> * %0, align 4
  %blended = call <1 x i64> @__vselect_i64(<1 x i64> %old, <1 x i64> %1, <1 x i32> %mask)
  store <1 x i64> %blended, <1 x i64> * %0, align 4
  ret void
}
|
||||
|
||||
; Macro expansion whose body is not visible here; going by its name it
; presumably emits the float and double masked store routines -- TODO confirm.
masked_store_float_double()
|
||||
|
||||
;; Turn the 1-wide mask into a bitmask: the top bit of the only lane becomes
;; bit 0 of the i64 result and every other result bit is zero.
define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
  %only_lane = extractelement <1 x i32> %0, i32 0
  ; a logical shift by 31 leaves just the former top bit
  %top_bit = lshr i32 %only_lane, 31
  %widened = zext i32 %top_bit to i64
  ret i64 %widened
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
;; There are not any rounding instructions in SSE2, so we have to emulate
|
||||
;; the functionality with multiple instructions...
|
||||
|
||||
; The code for __round_* is the result of compiling the following source
|
||||
; code.
|
||||
;
|
||||
; export float Round(float x) {
|
||||
; unsigned int sign = signbits(x);
|
||||
; unsigned int ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; x += 0x1.0p23f;
|
||||
; x -= 0x1.0p23f;
|
||||
; ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; return x;
|
||||
;}
|
||||
|
||||
;; Round the single lane to the nearest integer (ties to even) without a
;; hardware rounding instruction.  The sign bit is stripped, 2^23 is added and
;; then subtracted so that the float adder discards the fraction, and the sign
;; bit is put back.
;;
;; That trick is only valid while the magnitude is below 2^23.  From there on
;; every float is already an integer, and the intermediate sum can need more
;; than 24 significant bits and be rounded to a neighbour (8388609.0 would
;; come back as 8388608.0).  Inputs with magnitude of 2^23 or more, and NaN,
;; are therefore returned unchanged.
;;
;; The work is done on the extracted scalar and the result is re-wrapped as a
;; one-lane vector.
define <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
  %x = extractelement <1 x float> %0, i32 0
  %xbits = bitcast float %x to i32
  ; isolate the sign bit, then clear it to get the magnitude
  %signbit = and i32 %xbits, -2147483648
  %magbits = xor i32 %xbits, %signbit
  %mag = bitcast i32 %magbits to float
  ; adding and removing 2^23 drops the fractional bits
  %bumped = fadd float %mag, 8.388608e+06
  %magrnd = fadd float %bumped, -8.388608e+06
  ; restore the original sign
  %magrndbits = bitcast float %magrnd to i32
  %rndbits = xor i32 %magrndbits, %signbit
  %rnd = bitcast i32 %rndbits to float
  ; ordered compare: false for NaN, so NaN takes the pass-through path too
  %fits = fcmp olt float %mag, 8.388608e+06
  %res = select i1 %fits, float %rnd, float %x
  %resvec = insertelement <1 x float> undef, float %res, i32 0
  ret <1 x float> %resvec
}
|
||||
|
||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||
;; bitcode from compiling the following source code...
|
||||
|
||||
;export float Floor(float x) {
|
||||
; float y = Round(x);
|
||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
||||
; float delta = -1.f;
|
||||
; unsigned int idelta = intbits(delta);
|
||||
; idelta &= cmp;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
;; floor for one lane: round to nearest, then add -1.0 when the rounded value
;; ended up above the input.  The constant is the bit pattern of -1.0f
;; (0xBF800000); ANDing it with the sign-extended compare result turns the
;; adjustment into either -1.0 or +0.0.
define <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
  %nearest = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
  %above = fcmp ogt <1 x float> %nearest, %0
  %above32 = sext <1 x i1> %above to <1 x i32>
  %adjbits = and <1 x i32> %above32, <i32 -1082130432>
  %adj = bitcast <1 x i32> %adjbits to <1 x float>
  %res = fadd <1 x float> %nearest, %adj
  ret <1 x float> %res
}
|
||||
|
||||
;; And here is the code we compiled to get the __ceil* functions below
|
||||
;
|
||||
;export uniform float Ceil(uniform float x) {
|
||||
; uniform float y = Round(x);
|
||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
||||
; uniform float delta = 1.f;
|
||||
; uniform int idelta = intbits(delta);
|
||||
; idelta &= yltx;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
;; ceil for one lane: round to nearest, then add 1.0 when the rounded value
;; ended up below the input.  The constant is the bit pattern of 1.0f
;; (0x3F800000); ANDing it with the sign-extended compare result turns the
;; adjustment into either 1.0 or +0.0.
define <1 x float> @__ceil_varying_float(<1 x float>) nounwind readonly alwaysinline {
  %nearest = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
  %below = fcmp olt <1 x float> %nearest, %0
  %below32 = sext <1 x i1> %below to <1 x i32>
  %adjbits = and <1 x i32> %below32, <i32 1065353216>
  %adj = bitcast <1 x i32> %adjbits to <1 x float>
  %res = fadd <1 x float> %nearest, %adj
  ret <1 x float> %res
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
; The C math library is expected to provide ceil, floor and round at link time.
|
||||
declare double @ceil (double) nounwind readnone
|
||||
declare double @floor (double) nounwind readnone
|
||||
declare double @round (double) nounwind readnone
|
||||
;declare float @llvm.sqrt.f32(float %Val)
|
||||
declare double @llvm.sqrt.f64(double %Val)
|
||||
declare float @llvm.sin.f32(float %Val)
|
||||
declare float @llvm.cos.f32(float %Val)
|
||||
declare float @llvm.sqrt.f32(float %Val)
|
||||
declare float @llvm.exp.f32(float %Val)
|
||||
declare float @llvm.log.f32(float %Val)
|
||||
declare float @llvm.pow.f32(float %f, float %e)
|
||||
|
||||
|
||||
|
||||
|
||||
;; stuff that could be in builtins ...
|
||||
|
||||
;; Helper macro for one-lane unary functions.  It expands to a complete
;; function body: lane 0 of the unnamed first function argument is extracted,
;; the scalar function given as the second macro argument is called on it,
;; and the result is returned repackaged as a one-lane vector.  The first
;; macro argument is the scalar element type.
define(`unary1to1', `
  %u1_in = extractelement <1 x $1> %0, i32 0
  %u1_out = call $1 $2($1 %u1_in)
  %u1_vec = insertelement <1 x $1> undef, $1 %u1_out, i32 0
  ret <1 x $1> %u1_vec
')
|
||||
|
||||
|
||||
|
||||
;; dummy 1 wide vector ops
|
||||
;; Four-stream layout conversion for a single lane.  With only one lane there
;; is nothing to transpose, so each input vector is stored as-is to the
;; output pointer with the same number.
define void
@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
                      <1 x float> %v3, <1 x float> * noalias %out0,
                      <1 x float> * noalias %out1, <1 x float> * noalias %out2,
                      <1 x float> * noalias %out3) nounwind alwaysinline {
  store <1 x float> %v0, <1 x float> * %out0
  store <1 x float> %v1, <1 x float> * %out1
  store <1 x float> %v2, <1 x float> * %out2
  store <1 x float> %v3, <1 x float> * %out3
  ret void
}
|
||||
|
||||
;; Inverse four-stream layout conversion for a single lane.  For one lane the
;; two directions are the same operation, so each input vector is simply
;; stored to the output pointer with the same number.
define void
@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
                      <1 x float> %v3, <1 x float> * noalias %out0,
                      <1 x float> * noalias %out1, <1 x float> * noalias %out2,
                      <1 x float> * noalias %out3) nounwind alwaysinline {
  store <1 x float> %v0, <1 x float> * %out0
  store <1 x float> %v1, <1 x float> * %out1
  store <1 x float> %v2, <1 x float> * %out2
  store <1 x float> %v3, <1 x float> * %out3
  ret void
}
|
||||
|
||||
;; Three-stream layout conversion for a single lane.  With only one lane
;; there is nothing to transpose, so each input vector is stored as-is to the
;; output pointer with the same number.
;; Carries nounwind and alwaysinline like the four-stream variants, so this
;; trivial body is inlined into callers instead of surviving as a call.
define void
@__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
                      <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
                      <1 x float> * %out2) nounwind alwaysinline {
  store <1 x float> %v0, <1 x float> * %out0
  store <1 x float> %v1, <1 x float> * %out1
  store <1 x float> %v2, <1 x float> * %out2
  ret void
}
|
||||
|
||||
;; Inverse three-stream layout conversion for a single lane.  For one lane
;; the two directions are the same operation, so this forwards to the
;; forward conversion.
;; Carries nounwind and alwaysinline like the four-stream variants, so the
;; wrapper is inlined into callers instead of surviving as a call.
define void
@__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1,
                      <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
                      <1 x float> * %out2) nounwind alwaysinline {
  call void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
                                  <1 x float> %v2, <1 x float> * %out0,
                                  <1 x float> * %out1, <1 x float> * %out2)
  ret void
}
|
||||
|
||||
|
||||
;; end builtins
|
||||
|
||||
|
||||
;; One-lane double rounding: lane 0 is passed to the scalar round function
;; declared earlier in this file and the result is re-wrapped as a vector.
define <1 x double> @__round_varying_double(<1 x double>) nounwind readonly alwaysinline {
  unary1to1(double,@round)
}
|
||||
|
||||
;; One-lane double floor: lane 0 is passed to the scalar floor function
;; declared earlier in this file and the result is re-wrapped as a vector.
define <1 x double> @__floor_varying_double(<1 x double>) nounwind readonly alwaysinline {
  unary1to1(double,@floor)
}
|
||||
|
||||
|
||||
;; One-lane double ceil: lane 0 is passed to the scalar ceil function
;; declared earlier in this file and the result is re-wrapped as a vector.
define <1 x double> @__ceil_varying_double(<1 x double>) nounwind readonly alwaysinline {
  unary1to1(double,@ceil)
}
|
||||
|
||||
; To do vector integer min and max, we do the vector compare and then sign
|
||||
; extend the i1 vector result to an i32 mask. The __vselect does the
|
||||
; rest...
|
||||
|
||||
;; Signed one-lane minimum.  The signed less-than compare of argument 0
;; against argument 1 is widened to an i32 mask, and the select helper is
;; called with (argument 1, argument 0, mask).
define <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
  %is_less = icmp slt <1 x i32> %0, %1
  %is_less32 = sext <1 x i1> %is_less to <1 x i32>
  %res = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %is_less32)
  ret <1 x i32> %res
}
|
||||
|
||||
;; Signed scalar minimum of the two arguments.
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %is_less = icmp slt i32 %0, %1
  %res = select i1 %is_less, i32 %0, i32 %1
  ret i32 %res
}
|
||||
|
||||
;; Signed one-lane maximum.  The signed greater-than compare of argument 0
;; against argument 1 is widened to an i32 mask, and the select helper is
;; called with (argument 1, argument 0, mask).
define <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
  %is_more = icmp sgt <1 x i32> %0, %1
  %is_more32 = sext <1 x i1> %is_more to <1 x i32>
  %res = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %is_more32)
  ret <1 x i32> %res
}
|
||||
|
||||
;; Signed scalar maximum of the two arguments.
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %is_more = icmp sgt i32 %0, %1
  %res = select i1 %is_more, i32 %0, i32 %1
  ret i32 %res
}
|
||||
|
||||
; The functions for unsigned ints are similar, just with unsigned
|
||||
; comparison functions...
|
||||
|
||||
;; Unsigned one-lane minimum.  The unsigned less-than compare of argument 0
;; against argument 1 is widened to an i32 mask, and the select helper is
;; called with (argument 1, argument 0, mask).
define <1 x i32> @__min_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
  %is_less = icmp ult <1 x i32> %0, %1
  %is_less32 = sext <1 x i1> %is_less to <1 x i32>
  %res = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %is_less32)
  ret <1 x i32> %res
}
|
||||
|
||||
;; Unsigned scalar minimum of the two arguments.
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %is_less = icmp ult i32 %0, %1
  %res = select i1 %is_less, i32 %0, i32 %1
  ret i32 %res
}
|
||||
|
||||
;; Unsigned one-lane maximum.  The unsigned greater-than compare of argument
;; 0 against argument 1 is widened to an i32 mask, and the select helper is
;; called with (argument 1, argument 0, mask).
define <1 x i32> @__max_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
  %is_more = icmp ugt <1 x i32> %0, %1
  %is_more32 = sext <1 x i1> %is_more to <1 x i32>
  %res = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %is_more32)
  ret <1 x i32> %res
}
|
||||
|
||||
;; Unsigned scalar maximum of the two arguments.
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %is_more = icmp ugt i32 %0, %1
  %res = select i1 %is_more, i32 %0, i32 %1
  ret i32 %res
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone

;; Number of set bits in a 32-bit value, via the LLVM ctpop intrinsic.
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %bits_set = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %bits_set
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone

;; Number of set bits in a 64-bit value, via the LLVM ctpop intrinsic.
define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
  %bits_set = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %bits_set
}
|
||||
|
||||
|
||||
;; Sum across lanes.  There is only one lane, so the sum is lane 0.
define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
  %only = extractelement <1 x float> %v, i32 0
  ret float %only
}
|
||||
|
||||
;; Minimum across lanes.  There is only one lane, so the minimum is lane 0.
define float @__reduce_min_float(<1 x float>) nounwind readnone {
  %only = extractelement <1 x float> %0, i32 0
  ret float %only
}
|
||||
|
||||
;; Maximum across lanes.  There is only one lane, so the maximum is lane 0.
define float @__reduce_max_float(<1 x float>) nounwind readnone {
  %only = extractelement <1 x float> %0, i32 0
  ret float %only
}
|
||||
|
||||
;; Sum across lanes.  There is only one lane, so the sum is lane 0.
define i32 @__reduce_add_int32(<1 x i32> %v) nounwind readnone {
  %only = extractelement <1 x i32> %v, i32 0
  ret i32 %only
}
|
||||
|
||||
;; Signed minimum across lanes.  There is only one lane, so it is lane 0.
define i32 @__reduce_min_int32(<1 x i32>) nounwind readnone {
  %only = extractelement <1 x i32> %0, i32 0
  ret i32 %only
}
|
||||
|
||||
;; Signed maximum across lanes.  There is only one lane, so it is lane 0.
define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
  %only = extractelement <1 x i32> %0, i32 0
  ret i32 %only
}
|
||||
|
||||
;; Unsigned sum across lanes.  There is only one lane, so the sum is lane 0;
;; this is the same extraction the signed variant performs.
define i32 @__reduce_add_uint32(<1 x i32> %v) nounwind readnone {
  %only = extractelement <1 x i32> %v, i32 0
  ret i32 %only
}
|
||||
|
||||
;; Unsigned minimum across lanes.  There is only one lane, so it is lane 0.
define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
  %only = extractelement <1 x i32> %0, i32 0
  ret i32 %only
}
|
||||
|
||||
;; Unsigned maximum across lanes.  There is only one lane, so it is lane 0.
define i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone {
  %only = extractelement <1 x i32> %0, i32 0
  ret i32 %only
}
|
||||
|
||||
|
||||
;; Sum across lanes.  There is only one lane, so the sum is lane 0.
define double @__reduce_add_double(<1 x double>) nounwind readnone {
  %only = extractelement <1 x double> %0, i32 0
  ret double %only
}
|
||||
|
||||
;; Minimum across lanes.  There is only one lane, so the minimum is lane 0.
define double @__reduce_min_double(<1 x double>) nounwind readnone {
  %only = extractelement <1 x double> %0, i32 0
  ret double %only
}
|
||||
|
||||
;; Maximum across lanes.  There is only one lane, so the maximum is lane 0.
define double @__reduce_max_double(<1 x double>) nounwind readnone {
  %only = extractelement <1 x double> %0, i32 0
  ret double %only
}
|
||||
|
||||
;; Sum across lanes.  There is only one lane, so the sum is lane 0.
define i64 @__reduce_add_int64(<1 x i64>) nounwind readnone {
  %only = extractelement <1 x i64> %0, i32 0
  ret i64 %only
}
|
||||
|
||||
;; Signed minimum across lanes.  There is only one lane, so it is lane 0.
define i64 @__reduce_min_int64(<1 x i64>) nounwind readnone {
  %only = extractelement <1 x i64> %0, i32 0
  ret i64 %only
}
|
||||
|
||||
;; Signed maximum across lanes.  There is only one lane, so it is lane 0.
define i64 @__reduce_max_int64(<1 x i64>) nounwind readnone {
  %only = extractelement <1 x i64> %0, i32 0
  ret i64 %only
}
|
||||
|
||||
;; Unsigned minimum across lanes.  There is only one lane, so it is lane 0.
define i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone {
  %only = extractelement <1 x i64> %0, i32 0
  ret i64 %only
}
|
||||
|
||||
;; Unsigned maximum across lanes.  There is only one lane, so it is lane 0.
define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone {
  %only = extractelement <1 x i64> %0, i32 0
  ret i64 %only
}
|
||||
|
||||
;; All-lanes-equal test for i32.  A single lane always agrees with itself:
;; lane 0 is written through the samevalue pointer and true is returned.
;; The mask argument is not consulted.
define i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue,
                                <1 x i32> %mask) nounwind alwaysinline {
  %only = extractelement <1 x i32> %vv, i32 0
  store i32 %only, i32 * %samevalue
  ret i1 true
}
|
||||
|
||||
;; All-lanes-equal test for float.  A single lane always agrees with itself:
;; lane 0 is written through the samevalue pointer and true is returned.
;; The mask argument is not consulted.
define i1 @__reduce_equal_float(<1 x float> %vv, float * %samevalue,
                                <1 x i32> %mask) nounwind alwaysinline {
  %only = extractelement <1 x float> %vv, i32 0
  store float %only, float * %samevalue
  ret i1 true
}
|
||||
|
||||
;; All-lanes-equal test for i64.  A single lane always agrees with itself:
;; lane 0 is written through the samevalue pointer and true is returned.
;; The mask argument is not consulted.
define i1 @__reduce_equal_int64(<1 x i64> %vv, i64 * %samevalue,
                                <1 x i32> %mask) nounwind alwaysinline {
  %only = extractelement <1 x i64> %vv, i32 0
  store i64 %only, i64 * %samevalue
  ret i1 true
}
|
||||
|
||||
;; All-lanes-equal test for double.  A single lane always agrees with itself:
;; lane 0 is written through the samevalue pointer and true is returned.
;; The mask argument is not consulted.
define i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue,
                                 <1 x i32> %mask) nounwind alwaysinline {
  %only = extractelement <1 x double> %vv, i32 0
  store double %only, double * %samevalue
  ret i1 true
}
|
||||
|
||||
; extracting/reinserting elements because I want to be able to remove vectors later on
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
;; One-lane reciprocal, computed as an exact scalar division 1.0 / x.  No
;; approximate reciprocal instruction is used, so no Newton-Raphson
;; refinement step is needed.
define <1 x float> @__rcp_varying_float(<1 x float>) nounwind readonly alwaysinline {
  %denom = extractelement <1 x float> %0, i32 0
  %recip = fdiv float 1.0, %denom
  %recipvec = insertelement <1 x float> undef, float %recip, i32 0
  ret <1 x float> %recipvec
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; sqrt
|
||||
|
||||
;; One-lane square root: lane 0 goes through the scalar LLVM sqrt intrinsic
;; and the result is re-wrapped as a one-lane vector.
define <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysinline {
  %elt = extractelement <1 x float> %0, i32 0
  %root = call float @llvm.sqrt.f32(float %elt)
  %rootvec = insertelement <1 x float> undef, float %root, i32 0
  ret <1 x float> %rootvec
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
;; One-lane reciprocal square root, composed from the one-lane square root
;; followed by the one-lane reciprocal.  No approximate instruction is
;; involved, so there is no Newton-Raphson refinement step.
define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline {
  %root = call <1 x float> @__sqrt_varying_float(<1 x float> %v)
  %inv = call <1 x float> @__rcp_varying_float(<1 x float> %root)
  ret <1 x float> %inv
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
;; One-lane sine.  No vector math library is called; lane 0 is passed to the
;; scalar LLVM sin intrinsic and the result is re-wrapped as a vector.
define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
  unary1to1(float, @llvm.sin.f32)
}
|
||||
|
||||
;; One-lane cosine.  No vector math library is called; lane 0 is passed to
;; the scalar LLVM cos intrinsic and the result is re-wrapped as a vector.
define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
  unary1to1(float,@llvm.cos.f32)
}
|
||||
|
||||
;; One-lane combined sine and cosine.
;;   arg 0 - input angle
;;   arg 1 - pointer that receives the sine
;;   arg 2 - pointer that receives the cosine
;; The function writes through both pointer arguments, so it must not carry
;; the readnone attribute: that attribute promises no memory writes, and with
;; it the optimizer would be free to drop calls to this void function.
define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind alwaysinline {
  %sinval = call <1 x float> @__svml_sin(<1 x float> %0)
  %cosval = call <1 x float> @__svml_cos(<1 x float> %0)
  store <1 x float> %sinval, <1 x float> * %1
  store <1 x float> %cosval, <1 x float> * %2
  ret void
}
|
||||
|
||||
;; One-lane tangent.  LLVM has no tan intrinsic here, so the value is
;; computed as sin(x) / cos(x) with the scalar sin and cos intrinsics that
;; this file already declares.  Returning the input unchanged, as a stub
;; would, silently gives callers a wrong answer.
define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
  %x = extractelement <1 x float> %0, i32 0
  %sinx = call float @llvm.sin.f32(float %x)
  %cosx = call float @llvm.cos.f32(float %x)
  %tanx = fdiv float %sinx, %cosx
  %tanvec = insertelement <1 x float> undef, float %tanx, i32 0
  ret <1 x float> %tanvec
}
|
||||
|
||||
;; atan is NOT implemented for this target.  This is a stub that hands its
;; argument back unchanged, so callers do not get a real arctangent.
define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
  ret <1 x float> %0
}
|
||||
|
||||
;; atan2 is NOT implemented for this target.  This is a stub: the first
;; argument is handed back unchanged and the second is ignored, so callers do
;; not get a real arctangent.
define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
  ret <1 x float> %0
}
|
||||
|
||||
;; One-lane exponential.  No vector math library is called; lane 0 is passed
;; to the scalar LLVM exp intrinsic and the result is re-wrapped as a vector.
define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
  unary1to1(float,@llvm.exp.f32)
}
|
||||
|
||||
;; One-lane logarithm.  No vector math library is called; lane 0 is passed to
;; the scalar LLVM log intrinsic and the result is re-wrapped as a vector.
define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
  unary1to1(float,@llvm.log.f32)
}
|
||||
|
||||
;; One-lane power function: argument 0 is the base, argument 1 the exponent.
;; Both lanes are extracted, passed to the scalar LLVM pow intrinsic, and the
;; result is re-wrapped as a one-lane vector.
define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
  %base = extractelement <1 x float> %0, i32 0
  %expo = extractelement <1 x float> %1, i32 0
  %power = call float @llvm.pow.f32(float %base, float %expo)
  %powervec = insertelement <1 x float> undef, float %power, i32 0
  ret <1 x float> %powervec
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
;; One-lane float maximum, done on the extracted scalars.  The compare is
;; ordered, so when either input is NaN the second argument is returned.
define <1 x float> @__max_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
  %lhs = extractelement <1 x float> %0, i32 0
  %rhs = extractelement <1 x float> %1, i32 0
  %lhs_wins = fcmp ogt float %lhs, %rhs
  %larger = select i1 %lhs_wins, float %lhs, float %rhs
  %largervec = insertelement <1 x float> undef, float %larger, i32 0
  ret <1 x float> %largervec
}
|
||||
|
||||
;; One-lane float minimum, done on the extracted scalars.  The compare is
;; ordered, so when either input is NaN the second argument is returned.
define <1 x float> @__min_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
  %lhs = extractelement <1 x float> %0, i32 0
  %rhs = extractelement <1 x float> %1, i32 0
  %lhs_wins = fcmp olt float %lhs, %rhs
  %smaller = select i1 %lhs_wins, float %lhs, float %rhs
  %smallervec = insertelement <1 x float> undef, float %smaller, i32 0
  ret <1 x float> %smallervec
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
;declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
|
||||
;; One-lane double square root: lane 0 is passed to the scalar LLVM sqrt
;; intrinsic for double and the result is re-wrapped as a vector.
define <1 x double> @__sqrt_varying_double(<1 x double>) nounwind alwaysinline {
  unary1to1(double,@llvm.sqrt.f64)
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
;declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
;declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
;; One-lane double minimum, done on the extracted scalars.  The compare is
;; ordered, so when either input is NaN the second argument is returned.
define <1 x double> @__min_varying_double(<1 x double>, <1 x double>) nounwind readnone {
  %lhs = extractelement <1 x double> %0, i32 0
  %rhs = extractelement <1 x double> %1, i32 0
  %lhs_wins = fcmp olt double %lhs, %rhs
  %smaller = select i1 %lhs_wins, double %lhs, double %rhs
  %smallervec = insertelement <1 x double> undef, double %smaller, i32 0
  ret <1 x double> %smallervec
}
|
||||
|
||||
;; One-lane double maximum, done on the extracted scalars.  The compare is
;; ordered, so when either input is NaN the second argument is returned.
define <1 x double> @__max_varying_double(<1 x double>, <1 x double>) nounwind readnone {
  %lhs = extractelement <1 x double> %0, i32 0
  %rhs = extractelement <1 x double> %1, i32 0
  %lhs_wins = fcmp ogt double %lhs, %rhs
  %larger = select i1 %lhs_wins, double %lhs, double %rhs
  %largervec = insertelement <1 x double> undef, double %larger, i32 0
  ret <1 x double> %largervec
}
|
||||
|
||||
|
||||
;; Scalar reciprocal, computed as an exact division 1.0 / x.  No approximate
;; instruction is involved, so no refinement step follows.
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
  %recip = fdiv float 1.0, %0
  ret float %recip
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
;; Scalar float rounding.  No rounding instruction is used here: the value is
;; wrapped in a one-lane vector, handed to the one-lane rounding routine, and
;; lane 0 of its result is returned.
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
  %wrapped = insertelement <1 x float> undef, float %0, i32 0
  %rounded = call <1 x float> @__round_varying_float(<1 x float> %wrapped)
  %res = extractelement <1 x float> %rounded, i32 0
  ret float %res
}
|
||||
|
||||
;; Scalar float floor: the value is wrapped in a one-lane vector, handed to
;; the one-lane floor routine, and lane 0 of its result is returned.
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  %wrapped = insertelement <1 x float> undef, float %0, i32 0
  %floored = call <1 x float> @__floor_varying_float(<1 x float> %wrapped)
  %res = extractelement <1 x float> %floored, i32 0
  ret float %res
}
|
||||
|
||||
;; Scalar float ceil: the value is wrapped in a one-lane vector, handed to
;; the one-lane ceil routine, and lane 0 of its result is returned.
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  %wrapped = insertelement <1 x float> undef, float %0, i32 0
  %ceiled = call <1 x float> @__ceil_varying_float(<1 x float> %wrapped)
  %res = extractelement <1 x float> %ceiled, i32 0
  ret float %res
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
|
||||
;; Scalar double rounding, forwarded to the round function declared earlier
;; in this file.
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
  %rounded = call double @round(double %0)
  ret double %rounded
}
|
||||
|
||||
;; Scalar double floor, forwarded to the floor function declared earlier in
;; this file.
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  %floored = call double @floor(double %0)
  ret double %floored
}
|
||||
|
||||
;; Scalar double ceil, forwarded to the ceil function declared earlier in
;; this file.
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  %ceiled = call double @ceil(double %0)
  ret double %ceiled
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
|
||||
;; Scalar float square root via the LLVM sqrt intrinsic.
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
  %root = call float @llvm.sqrt.f32(float %0)
  ret float %root
}
|
||||
|
||||
;; Scalar double square root via the LLVM sqrt intrinsic.
define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {
  %root = call double @llvm.sqrt.f64(double %0)
  ret double %root
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
|
||||
;; Scalar reciprocal square root, composed from the scalar square root
;; followed by the scalar reciprocal.
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  %root = call float @__sqrt_uniform_float(float %0)
  %inv = call float @__rcp_uniform_float(float %root)
  ret float %inv
}
|
||||
|
||||
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fastmath
|
||||
|
||||
|
||||
;; Fast-math hook.  This target has nothing to configure, so the body is
;; intentionally empty.
define void @__fastmath() nounwind alwaysinline {
  ret void
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
|
||||
;; Scalar float maximum.  The compare is ordered, so when either input is
;; NaN the second argument is returned.
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
  %first_wins = fcmp ogt float %0, %1
  %larger = select i1 %first_wins, float %0, float %1
  ret float %larger
}
|
||||
|
||||
;; Scalar float minimum.  The compare is ordered, so when either input is
;; NaN the second argument is returned.
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
  %first_wins = fcmp olt float %0, %1
  %smaller = select i1 %first_wins, float %0, float %1
  ret float %smaller
}
|
||||
;; Scalar double maximum.  The compare is ordered, so when either input is
;; NaN the second argument is returned.
define double @__max_uniform_double(double, double) nounwind readonly alwaysinline {
  %first_wins = fcmp ogt double %0, %1
  %larger = select i1 %first_wins, double %0, double %1
  ret double %larger
}
|
||||
|
||||
;; Scalar double minimum.  The compare is ordered, so when either input is
;; NaN the second argument is returned.
define double @__min_uniform_double(double, double) nounwind readonly alwaysinline {
  %first_wins = fcmp olt double %0, %1
  %smaller = select i1 %first_wins, double %0, double %1
  ret double %smaller
}
|
||||
|
||||
define_shuffles()
|
||||
|
||||
ctlztz()
|
||||
|
||||
define_prefetches()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
34
builtins/target-generic-16.ll
Normal file
34
builtins/target-generic-16.ll
Normal file
@@ -0,0 +1,34 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
define(`WIDTH',`16')
|
||||
include(`target-generic-common.ll')
|
||||
|
||||
33
builtins/target-generic-32.ll
Normal file
33
builtins/target-generic-32.ll
Normal file
@@ -0,0 +1,33 @@
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
define(`WIDTH',`32')
|
||||
include(`target-generic-common.ll')
|
||||
34
builtins/target-generic-4.ll
Normal file
34
builtins/target-generic-4.ll
Normal file
@@ -0,0 +1,34 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;; Generic target, 4 lanes: set the vector width used by the shared generic
;; definitions, then pull those definitions in.
define(`WIDTH',`4')
|
||||
include(`target-generic-common.ll')
|
||||
|
||||
33
builtins/target-generic-64.ll
Normal file
33
builtins/target-generic-64.ll
Normal file
@@ -0,0 +1,33 @@
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;; Generic target, 64 lanes: set the vector width used by the shared generic
;; definitions, then pull those definitions in.
define(`WIDTH',`64')
|
||||
include(`target-generic-common.ll')
|
||||
34
builtins/target-generic-8.ll
Normal file
34
builtins/target-generic-8.ll
Normal file
@@ -0,0 +1,34 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;; Generic target, 8 lanes: set the vector width used by the shared generic
;; definitions, then pull those definitions in.
define(`WIDTH',`8')
|
||||
include(`target-generic-common.ll')
|
||||
|
||||
360
builtins/target-generic-common.ll
Normal file
360
builtins/target-generic-common.ll
Normal file
@@ -0,0 +1,360 @@
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;; Shared body of the generic targets.  The including file is expected to have
;; set the vector width already; this file sets the mask element type to i1,
;; loads util.m4, and then expands three macros that are defined outside this
;; file.  NOTE(review): their output is not visible here -- confirm what each
;; one emits before relying on it.
define(`MASK',`i1')
|
||||
include(`util.m4')
|
||||
|
||||
stdlib_core()
|
||||
scans()
|
||||
reduce_equal(WIDTH)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; broadcast/rotate/shuffle
|
||||
|
||||
declare <WIDTH x float> @__smear_float(<WIDTH x float>, float) nounwind readnone
|
||||
declare <WIDTH x double> @__smear_double(<WIDTH x double>, double) nounwind readnone
|
||||
declare <WIDTH x i8> @__smear_i8(<WIDTH x i8>, i8) nounwind readnone
|
||||
declare <WIDTH x i16> @__smear_i16(<WIDTH x i16>, i16) nounwind readnone
|
||||
declare <WIDTH x i32> @__smear_i32(<WIDTH x i32>, i32) nounwind readnone
|
||||
declare <WIDTH x i64> @__smear_i64(<WIDTH x i64>, i64) nounwind readnone
|
||||
|
||||
declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
|
||||
declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
|
||||
declare <WIDTH x i8> @__broadcast_i8(<WIDTH x i8>, i32) nounwind readnone
|
||||
declare <WIDTH x i16> @__broadcast_i16(<WIDTH x i16>, i32) nounwind readnone
|
||||
declare <WIDTH x i32> @__broadcast_i32(<WIDTH x i32>, i32) nounwind readnone
|
||||
declare <WIDTH x i64> @__broadcast_i64(<WIDTH x i64>, i32) nounwind readnone
|
||||
|
||||
declare <WIDTH x i8> @__rotate_i8(<WIDTH x i8>, i32) nounwind readnone
|
||||
declare <WIDTH x i16> @__rotate_i16(<WIDTH x i16>, i32) nounwind readnone
|
||||
declare <WIDTH x float> @__rotate_float(<WIDTH x float>, i32) nounwind readnone
|
||||
declare <WIDTH x i32> @__rotate_i32(<WIDTH x i32>, i32) nounwind readnone
|
||||
declare <WIDTH x double> @__rotate_double(<WIDTH x double>, i32) nounwind readnone
|
||||
declare <WIDTH x i64> @__rotate_i64(<WIDTH x i64>, i32) nounwind readnone
|
||||
|
||||
declare <WIDTH x i8> @__shuffle_i8(<WIDTH x i8>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i8> @__shuffle2_i8(<WIDTH x i8>, <WIDTH x i8>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i16> @__shuffle_i16(<WIDTH x i16>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i16> @__shuffle2_i16(<WIDTH x i16>, <WIDTH x i16>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x float> @__shuffle_float(<WIDTH x float>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x float> @__shuffle2_float(<WIDTH x float>, <WIDTH x float>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i32> @__shuffle_i32(<WIDTH x i32>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i32> @__shuffle2_i32(<WIDTH x i32>, <WIDTH x i32>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x double> @__shuffle_double(<WIDTH x double>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x double> @__shuffle2_double(<WIDTH x double>,
|
||||
<WIDTH x double>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i64> @__shuffle_i64(<WIDTH x i64>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i64> @__shuffle2_i64(<WIDTH x i64>, <WIDTH x i64>,
|
||||
<WIDTH x i32>) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; aos/soa
|
||||
|
||||
declare void @__soa_to_aos3_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
|
||||
<WIDTH x float> %v2, float * noalias %p) nounwind
|
||||
declare void @__aos_to_soa3_float(float * noalias %p, <WIDTH x float> * %out0,
|
||||
<WIDTH x float> * %out1, <WIDTH x float> * %out2) nounwind
|
||||
declare void @__soa_to_aos4_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
|
||||
<WIDTH x float> %v2, <WIDTH x float> %v3,
|
||||
float * noalias %p) nounwind
|
||||
declare void @__aos_to_soa4_float(float * noalias %p, <WIDTH x float> * noalias %out0,
|
||||
<WIDTH x float> * noalias %out1,
|
||||
<WIDTH x float> * noalias %out2,
|
||||
<WIDTH x float> * noalias %out3) nounwind
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; math
|
||||
|
||||
declare void @__fastmath() nounwind
|
||||
|
||||
;; round/floor/ceil
|
||||
|
||||
declare float @__round_uniform_float(float) nounwind readnone
|
||||
declare float @__floor_uniform_float(float) nounwind readnone
|
||||
declare float @__ceil_uniform_float(float) nounwind readnone
|
||||
|
||||
declare double @__round_uniform_double(double) nounwind readnone
|
||||
declare double @__floor_uniform_double(double) nounwind readnone
|
||||
declare double @__ceil_uniform_double(double) nounwind readnone
|
||||
|
||||
declare <WIDTH x float> @__round_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__floor_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__ceil_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
|
||||
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
|
||||
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
|
||||
|
||||
;; min/max
|
||||
|
||||
declare float @__max_uniform_float(float, float) nounwind readnone
|
||||
declare float @__min_uniform_float(float, float) nounwind readnone
|
||||
declare i32 @__min_uniform_int32(i32, i32) nounwind readnone
|
||||
declare i32 @__max_uniform_int32(i32, i32) nounwind readnone
|
||||
declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone
|
||||
declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone
|
||||
declare i64 @__min_uniform_int64(i64, i64) nounwind readnone
|
||||
declare i64 @__max_uniform_int64(i64, i64) nounwind readnone
|
||||
declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone
|
||||
declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone
|
||||
declare double @__min_uniform_double(double, double) nounwind readnone
|
||||
declare double @__max_uniform_double(double, double) nounwind readnone
|
||||
|
||||
declare <WIDTH x float> @__max_varying_float(<WIDTH x float>,
|
||||
<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__min_varying_float(<WIDTH x float>,
|
||||
<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
|
||||
declare <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
|
||||
declare <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
|
||||
declare <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
|
||||
declare <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
|
||||
declare <WIDTH x double> @__min_varying_double(<WIDTH x double>,
|
||||
<WIDTH x double>) nounwind readnone
|
||||
declare <WIDTH x double> @__max_varying_double(<WIDTH x double>,
|
||||
<WIDTH x double>) nounwind readnone
|
||||
|
||||
;; sqrt/rsqrt/rcp
|
||||
|
||||
declare float @__rsqrt_uniform_float(float) nounwind readnone
|
||||
declare float @__rcp_uniform_float(float) nounwind readnone
|
||||
declare float @__sqrt_uniform_float(float) nounwind readnone
|
||||
declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
|
||||
|
||||
declare double @__sqrt_uniform_double(double) nounwind readnone
|
||||
declare <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone
|
||||
|
||||
;; bit ops
|
||||
|
||||
declare i32 @__popcnt_int32(i32) nounwind readnone
|
||||
declare i64 @__popcnt_int64(i64) nounwind readnone
|
||||
|
||||
declare i32 @__count_trailing_zeros_i32(i32) nounwind readnone
|
||||
declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
|
||||
declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
|
||||
declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
|
||||
|
||||
;; svml
|
||||
|
||||
; FIXME: need either to wire these up to SVML entrypoints that match the
|
||||
; vector width of this target, or to use the macro to call the 4-wide ones
|
||||
; repeatedly on 4-wide slices of our vectors...
|
||||
|
||||
declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
|
||||
declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
|
||||
declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
|
||||
declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
|
||||
declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
|
||||
declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
|
||||
declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
|
||||
declare <WIDTH x float> @__svml_log(<WIDTH x float>)
|
||||
declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reductions
|
||||
|
||||
declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone
|
||||
|
||||
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
|
||||
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
|
||||
declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone
|
||||
|
||||
declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
|
||||
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
|
||||
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
|
||||
|
||||
declare i32 @__reduce_add_uint32(<WIDTH x i32>) nounwind readnone
|
||||
declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
|
||||
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
|
||||
|
||||
declare double @__reduce_add_double(<WIDTH x double>) nounwind readnone
|
||||
declare double @__reduce_min_double(<WIDTH x double>) nounwind readnone
|
||||
declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone
|
||||
|
||||
declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
|
||||
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
|
||||
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
|
||||
|
||||
declare i64 @__reduce_add_uint64(<WIDTH x i64>) nounwind readnone
|
||||
declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
|
||||
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked loads/stores
|
||||
|
||||
|
||||
declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x float> @__masked_load_float(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
declare <WIDTH x double> @__masked_load_double(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
|
||||
declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_float(<WIDTH x float>* nocapture, <WIDTH x float>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
declare void @__masked_store_double(<WIDTH x double>* nocapture, <WIDTH x double>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_0', `
|
||||
declare void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
declare void @__masked_store_blend_double(<WIDTH x double>* nocapture, <WIDTH x double>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
', `
|
||||
define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture %ptr, <WIDTH x i8> %vals,
                                     <WIDTH x i1> %mask) nounwind alwaysinline {
  ; Read the full vector from memory and pick per lane: the incoming value
  ; where the mask bit is set and the loaded value elsewhere.  The full
  ; vector is then written back so every lane is stored.
  %old = load <WIDTH x i8> * %ptr
  %merged = select <WIDTH x i1> %mask, <WIDTH x i8> %vals, <WIDTH x i8> %old
  store <WIDTH x i8> %merged, <WIDTH x i8> * %ptr
  ret void
}
|
||||
|
||||
define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture %ptr, <WIDTH x i16> %vals,
                                      <WIDTH x i1> %mask) nounwind alwaysinline {
  ; Read the full vector from memory and pick per lane: the incoming value
  ; where the mask bit is set and the loaded value elsewhere.  The full
  ; vector is then written back so every lane is stored.
  %old = load <WIDTH x i16> * %ptr
  %merged = select <WIDTH x i1> %mask, <WIDTH x i16> %vals, <WIDTH x i16> %old
  store <WIDTH x i16> %merged, <WIDTH x i16> * %ptr
  ret void
}
|
||||
|
||||
define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture %ptr, <WIDTH x i32> %vals,
                                      <WIDTH x i1> %mask) nounwind alwaysinline {
  ; Read the full vector from memory and pick per lane: the incoming value
  ; where the mask bit is set and the loaded value elsewhere.  The full
  ; vector is then written back so every lane is stored.
  %old = load <WIDTH x i32> * %ptr
  %merged = select <WIDTH x i1> %mask, <WIDTH x i32> %vals, <WIDTH x i32> %old
  store <WIDTH x i32> %merged, <WIDTH x i32> * %ptr
  ret void
}
|
||||
|
||||
define void @__masked_store_blend_float(<WIDTH x float>* nocapture %ptr, <WIDTH x float> %vals,
                                        <WIDTH x i1> %mask) nounwind alwaysinline {
  ; Read the full vector from memory and pick per lane: the incoming value
  ; where the mask bit is set and the loaded value elsewhere.  The full
  ; vector is then written back so every lane is stored.
  %old = load <WIDTH x float> * %ptr
  %merged = select <WIDTH x i1> %mask, <WIDTH x float> %vals, <WIDTH x float> %old
  store <WIDTH x float> %merged, <WIDTH x float> * %ptr
  ret void
}
|
||||
|
||||
define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr, <WIDTH x i64> %vals,
                                      <WIDTH x i1> %mask) nounwind alwaysinline {
  ; Read the full vector from memory and pick per lane: the incoming value
  ; where the mask bit is set and the loaded value elsewhere.  The full
  ; vector is then written back so every lane is stored.
  %old = load <WIDTH x i64> * %ptr
  %merged = select <WIDTH x i1> %mask, <WIDTH x i64> %vals, <WIDTH x i64> %old
  store <WIDTH x i64> %merged, <WIDTH x i64> * %ptr
  ret void
}
|
||||
|
||||
define void @__masked_store_blend_double(<WIDTH x double>* nocapture %ptr, <WIDTH x double> %vals,
                                         <WIDTH x i1> %mask) nounwind alwaysinline {
  ; Read the full vector from memory and pick per lane: the incoming value
  ; where the mask bit is set and the loaded value elsewhere.  The full
  ; vector is then written back so every lane is stored.
  %old = load <WIDTH x double> * %ptr
  %merged = select <WIDTH x i1> %mask, <WIDTH x double> %vals, <WIDTH x double> %old
  store <WIDTH x double> %merged, <WIDTH x double> * %ptr
  ret void
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
;; Declares the gather and scatter entry points for one element type, which is
;; the single argument of the macro.  For each type there are eight external
;; functions: gathers and scatters taking a vector of 32-bit or 64-bit
;; addresses, and base-plus-offsets forms taking an i8 base pointer, a vector
;; of offsets, an i32 and a second vector of offsets.  Every form takes the
;; lane mask as its last parameter.  The macro is then instantiated for the six
;; element types supported by the target.
define(`gather_scatter', `
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>, i32,
                                                 <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>, i32,
                                                 <WIDTH x i64>, <WIDTH x i1>) nounwind readonly

declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>, i32,
                                          <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>, i32,
                                          <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
')

gather_scatter(i8)
gather_scatter(i16)
gather_scatter(i32)
gather_scatter(float)
gather_scatter(i64)
gather_scatter(double)
|
||||
|
||||
declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare i32 @__packed_store_active(i32 * nocapture, <WIDTH x i32> %vals,
|
||||
<WIDTH x i1>) nounwind
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; prefetch
|
||||
|
||||
declare void @__prefetch_read_uniform_1(i8 * nocapture) nounwind
|
||||
declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
|
||||
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
|
||||
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
|
||||
|
||||
271
builtins/target-sse2-common.ll
Normal file
271
builtins/target-sse2-common.ll
Normal file
@@ -0,0 +1,271 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;; NOTE(review): the four macros expanded below are defined outside this file;
;; going by their names they presumably emit the count-leading/trailing-zeros,
;; prefetch, shuffle and AOS/SOA helpers -- confirm against their definitions.
ctlztz()
|
||||
define_prefetches()
|
||||
define_shuffles()
|
||||
aossoa()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
  ; Get an initial reciprocal estimate from the scalar SSE rcp intrinsic.
  ; The intrinsic takes a <4 x float>, so the input goes into lane 0 (the
  ; other lanes are left undef) and lane 0 of the result is read back out.
  %x_vec = insertelement <4 x float> undef, float %0, i32 0
  %approx_vec = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %x_vec)
  %approx = extractelement <4 x float> %approx_vec, i32 0

  ; One Newton-Raphson step to improve precision:
  ;   refined = approx * (2 - x * approx)
  %x_approx = fmul float %0, %approx
  %correction = fsub float 2., %x_approx
  %refined = fmul float %approx, %correction
  ret float %refined
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  ; Initial reciprocal square root estimate: run the scalar SSE rsqrt
  ; intrinsic on a vector holding the input in lane 0 and take lane 0 of
  ; the result.
  %x_vec = insertelement <4 x float> undef, float %0, i32 0
  %est_vec = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %x_vec)
  %est = extractelement <4 x float> %est_vec, i32 0

  ; One Newton-Raphson step to improve precision:
  ;   result = 0.5 * est * (3 - (x * est) * est)
  %x_est = fmul float %0, %est
  %x_est_est = fmul float %x_est, %est
  %three_minus = fsub float 3., %x_est_est
  %scaled = fmul float %est, %three_minus
  %result = fmul float 0.5, %scaled
  ret float %result
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
|
||||
;; Uniform (scalar) float square root, built on the scalar SSE sqrt intrinsic
;; named in the macro call below.
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; NOTE(review): the macro used here is defined outside this file; presumably
; it places the scalar in lane 0 of a 4-wide vector, calls the intrinsic and
; leaves lane 0 of the result in %ret -- confirm against its definition.
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fast math mode
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define void @__fastmath() nounwind alwaysinline {
  ; Store the current MXCSR value into a stack slot; the stmxcsr/ldmxcsr
  ; intrinsics take an i8 pointer, hence the bitcast.
  %csr_slot = alloca i32
  %csr_slot_i8 = bitcast i32 * %csr_slot to i8 *
  call void @llvm.x86.sse.stmxcsr(i8 * %csr_slot_i8)
  %csr_before = load i32 *%csr_slot

  ; Turn on DAZ (64) and FTZ (32768): 64 + 32768 = 32832.  All other bits
  ; of the register are kept as they were.
  %csr_after = or i32 %csr_before, 32832
  store i32 %csr_after, i32 *%csr_slot
  call void @llvm.x86.sse.ldmxcsr(i8 * %csr_slot_i8)
  ret void
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
;; Uniform (scalar) float maximum, built on the scalar SSE max intrinsic named
;; in the macro call below.
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
; NOTE(review): the macro used here is defined outside this file; presumably
; it places both scalars in lane 0 of 4-wide vectors, calls the intrinsic and
; leaves lane 0 of the result in %ret -- confirm against its definition.
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
|
||||
;; Uniform (scalar) float minimum, built on the scalar SSE min intrinsic named
;; in the macro call below.
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
; NOTE(review): the macro used here is defined outside this file; presumably
; it places both scalars in lane 0 of 4-wide vectors, calls the intrinsic and
; leaves lane 0 of the result in %ret -- confirm against its definition.
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
;; Uniform (scalar) double-precision square root, built on the scalar SSE2
;; sqrt intrinsic named in the macro call below.
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
; NOTE(review): the macro used here is defined outside this file; presumably
; it places the scalar in lane 0 of a 2-wide vector, calls the intrinsic and
; leaves lane 0 of the result in %ret -- confirm against its definition.
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
;; Uniform (scalar) double-precision minimum, built on the scalar SSE2 min
;; intrinsic named in the macro call below.  Marked alwaysinline so that it is
;; treated the same way as the float min/max helpers defined in this file.
define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
  ; NOTE(review): the macro used here is defined outside this file; presumably
  ; it places both scalars in lane 0 of 2-wide vectors, calls the intrinsic and
  ; leaves lane 0 of the result in %ret -- confirm against its definition.
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
  ret double %ret
}
|
||||
|
||||
;; Uniform (scalar) double-precision maximum, built on the scalar SSE2 max
;; intrinsic named in the macro call below.  Marked alwaysinline so that it is
;; treated the same way as the float min/max helpers defined in this file.
define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
  ; NOTE(review): the macro used here is defined outside this file; presumably
  ; it places both scalars in lane 0 of 2-wide vectors, calls the intrinsic and
  ; leaves lane 0 of the result in %ret -- confirm against its definition.
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
  ret double %ret
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
;; There are not any rounding instructions in SSE2, so we have to emulate
|
||||
;; the functionality with multiple instructions...
|
||||
|
||||
; The code for __round_* is the result of compiling the following source
|
||||
; code.
|
||||
;
|
||||
; export float Round(float x) {
|
||||
; unsigned int sign = signbits(x);
|
||||
; unsigned int ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; x += 0x1.0p23f;
|
||||
; x -= 0x1.0p23f;
|
||||
; ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; return x;
|
||||
;}
|
||||
|
||||
; Rounds a uniform float to an integral value following the reference code
; in the comment above: strip the sign bit, add and then subtract 8388608.0
; (0x1.0p23f), and finally put the sign bit back.
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
  %in_bits = bitcast float %0 to i32
  ; keep only the sign bit (0x80000000)
  %sign_bit = and i32 %in_bits, -2147483648
  ; xor with the sign bit clears it, leaving the bits of the magnitude
  %mag_bits = xor i32 %sign_bit, %in_bits
  %mag_val = bitcast i32 %mag_bits to float
  %mag_plus = fadd float %mag_val, 8.388608e+06
  %mag_rounded = fadd float %mag_plus, -8.388608e+06
  %rounded_bits = bitcast float %mag_rounded to i32
  ; reapply the original sign
  %out_bits = xor i32 %rounded_bits, %sign_bit
  %out_val = bitcast i32 %out_bits to float
  ret float %out_val
}
|
||||
|
||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||
;; bitcode from compiling the following source code...
|
||||
|
||||
;export float Floor(float x) {
|
||||
; float y = Round(x);
|
||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
||||
; float delta = -1.f;
|
||||
; unsigned int idelta = intbits(delta);
|
||||
; idelta &= cmp;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
; Floor of a uniform float: round first, then add -1.0 when the rounded
; value ended up greater than the input.
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  %rounded_val = tail call float @__round_uniform_float(float %0) nounwind
  %went_up = fcmp ogt float %rounded_val, %0
  ; all ones when the comparison holds, zero otherwise
  %cmp_bits = sext i1 %went_up to i32
  ; -1082130432 is 0xBF800000, the bit pattern of -1.0f
  %delta_bits = and i32 %cmp_bits, -1082130432
  %delta_val = bitcast i32 %delta_bits to float
  %floored_val = fadd float %rounded_val, %delta_val
  ret float %floored_val
}
|
||||
|
||||
;; And here is the code we compiled to get the __ceil* functions below
|
||||
;
|
||||
;export uniform float Ceil(uniform float x) {
|
||||
; uniform float y = Round(x);
|
||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
||||
; uniform float delta = 1.f;
|
||||
; uniform int idelta = intbits(delta);
|
||||
; idelta &= yltx;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
; Ceiling of a uniform float: round first, then add 1.0 when the rounded
; value ended up less than the input.
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  %rounded_val = tail call float @__round_uniform_float(float %0) nounwind
  %went_down = fcmp olt float %rounded_val, %0
  ; all ones when the comparison holds, zero otherwise
  %cmp_bits = sext i1 %went_down to i32
  ; 1065353216 is 0x3F800000, the bit pattern of 1.0f
  %delta_bits = and i32 %cmp_bits, 1065353216
  %delta_val = bitcast i32 %delta_bits to float
  %ceiled_val = fadd float %rounded_val, %delta_val
  ret float %ceiled_val
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare double @round(double)
|
||||
declare double @floor(double)
|
||||
declare double @ceil(double)
|
||||
|
||||
; Rounds a uniform double by forwarding to the external @round function.
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
  %rounded_val = call double @round(double %0)
  ret double %rounded_val
}
|
||||
|
||||
; Floor of a uniform double by forwarding to the external @floor function.
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  %floored_val = call double @floor(double %0)
  ret double %floored_val
}
|
||||
|
||||
; Ceiling of a uniform double by forwarding to the external @ceil function.
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  %ceiled_val = call double @ceil(double %0)
  ret double %ceiled_val
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32)
|
||||
declare i64 @llvm.ctpop.i64(i64)
|
||||
|
||||
; Number of set bits in a 32-bit value, via the @llvm.ctpop.i32 intrinsic.
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %bit_count = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %bit_count
}
|
||||
|
||||
; Number of set bits in a 64-bit value, via the @llvm.ctpop.i64 intrinsic.
define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
  %bit_count = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %bit_count
}
|
||||
|
||||
|
||||
648
builtins/target-sse2-x2.ll
Normal file
648
builtins/target-sse2-x2.ll
Normal file
@@ -0,0 +1,648 @@
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
;; This file defines the target for "double-pumped" SSE2, i.e. running
|
||||
;; with 8-wide vectors
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; standard 8-wide definitions from m4 macros
|
||||
|
||||
define(`WIDTH',`8')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
|
||||
include(`target-sse2-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
|
||||
; Per-lane reciprocal of an 8-wide float vector. A starting value %call is
; obtained from @llvm.x86.sse.rcp.ps through an m4 helper and then refined
; with one Newton-Raphson step:
;   result = start * (2 - x * start)
define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
  unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
  ; x * start
  %x_times_start = fmul <8 x float> %0, %call
  ; 2 - x * start
  %two_minus_prod = fsub <8 x float> <float 2., float 2., float 2., float 2.,
                                      float 2., float 2., float 2., float 2.>, %x_times_start
  ; start * (2 - x * start)
  %refined_rcp = fmul <8 x float> %call, %two_minus_prod
  ret <8 x float> %refined_rcp
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
; Per-lane reciprocal square root of an 8-wide float vector. A starting
; value %is is obtained from @llvm.x86.sse.rsqrt.ps through an m4 helper and
; then refined as
;   result = 0.5 * start * (3 - (v * start) * start)
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
  unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
  ; (v * start) * start
  %v_times_start = fmul <8 x float> %v, %is
  %v_times_start_sq = fmul <8 x float> %v_times_start, %is
  ; 3 - (v * start) * start
  %three_minus_prod = fsub <8 x float> <float 3., float 3., float 3., float 3.,
                                        float 3., float 3., float 3., float 3.>, %v_times_start_sq
  ; start * (3 - ...), then scale by one half
  %start_times_corr = fmul <8 x float> %is, %three_minus_prod
  %refined_rsqrt = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
                                     float 0.5, float 0.5, float 0.5, float 0.5>, %start_times_corr
  ret <8 x float> %refined_rsqrt
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
; Per-lane square root of an 8-wide float vector. The body is an m4 helper
; expansion around the 4-wide @llvm.x86.sse.sqrt.ps intrinsic applied to %0;
; the value returned is the %call produced by that expansion.
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
|
||||
; Returns the sines and writes the cosines through its pointer argument (the
; caller in this file loads them back from that pointer), so this
; declaration must not carry readnone.
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind
|
||||
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
|
||||
; 8-wide sine. The body is an m4 helper expansion around the 4-wide
; @__svml_sinf4 routine applied to %0; the value returned is the %ret
; produced by that expansion.
define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_sinf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
; 8-wide cosine. The body is an m4 helper expansion around the 4-wide
; @__svml_cosf4 routine applied to %0; the value returned is the %ret
; produced by that expansion.
define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_cosf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
; Computes sine and cosine of all 8 lanes of %0 with two calls to the 4-wide
; @__svml_sincosf4 routine. The 8 sines are stored through %1 and the 8
; cosines through %2.
;
; This function is deliberately not marked readnone: it writes through both
; of its pointer arguments, which a readnone function is not allowed to do.
define void @__svml_sincos(<8 x float>, <8 x float> *,
                           <8 x float> *) nounwind alwaysinline {
  ; split the input into its two 4-wide halves
  %a = shufflevector <8 x float> %0, <8 x float> undef,
         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b = shufflevector <8 x float> %0, <8 x float> undef,
         <4 x i32> <i32 4, i32 5, i32 6, i32 7>

  ; each call returns 4 sines and leaves 4 cosines in its stack slot
  %cospa = alloca <4 x float>
  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)

  %cospb = alloca <4 x float>
  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)

  ; stitch the sine halves back together and store them
  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                    i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %sin, <8 x float> * %1

  ; same for the cosine halves
  %cosa = load <4 x float> * %cospa
  %cosb = load <4 x float> * %cospb
  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                    i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %cos, <8 x float> * %2

  ret void
}
|
||||
|
||||
; 8-wide tangent. The body is an m4 helper expansion around the 4-wide
; @__svml_tanf4 routine applied to %0; the value returned is the %ret
; produced by that expansion.
define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_tanf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
; 8-wide arctangent. The body is an m4 helper expansion around the 4-wide
; @__svml_atanf4 routine applied to %0; the value returned is the %ret
; produced by that expansion.
define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_atanf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
; 8-wide two-argument arctangent. The body is an m4 helper expansion around
; the 4-wide @__svml_atan2f4 routine applied to %0 and %1, in that order;
; the value returned is the %ret produced by that expansion.
define <8 x float> @__svml_atan2(<8 x float>,
|
||||
<8 x float>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
; 8-wide exponential. The body is an m4 helper expansion around the 4-wide
; @__svml_expf4 routine applied to %0; the value returned is the %ret
; produced by that expansion.
define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_expf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
; 8-wide logarithm. The body is an m4 helper expansion around the 4-wide
; @__svml_logf4 routine applied to %0; the value returned is the %ret
; produced by that expansion.
define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_logf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
; 8-wide power function. The body is an m4 helper expansion around the
; 4-wide @__svml_powf4 routine applied to %0 and %1, in that order; the
; value returned is the %ret produced by that expansion.
define <8 x float> @__svml_pow(<8 x float>,
|
||||
<8 x float>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, float, @__svml_powf4, %0, %1)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
; Per-lane maximum of two 8-wide float vectors. The body is an m4 helper
; expansion around the 4-wide @llvm.x86.sse.max.ps intrinsic applied to %0
; and %1; the value returned is the %call produced by that expansion.
define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
; Per-lane minimum of two 8-wide float vectors. The body is an m4 helper
; expansion around the 4-wide @llvm.x86.sse.min.ps intrinsic applied to %0
; and %1; the value returned is the %call produced by that expansion.
define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; min/max
|
||||
|
||||
; There is no blend instruction with SSE2, so we simulate it with bit
|
||||
; operations on i32s. For these two vselect functions, for each
|
||||
; vector element, if the mask is on, we return the corresponding value
|
||||
; from %1, and otherwise return the value from %0.
|
||||
|
||||
; Bitwise blend of two 8-wide i32 vectors: wherever a bit of %mask is set
; the result takes the bit from %1, elsewhere it takes the bit from %0.
define <8 x i32> @__vselect_i32(<8 x i32>, <8 x i32> ,
                                <8 x i32> %mask) nounwind readnone alwaysinline {
  ; bits contributed by the second operand
  %taken_bits = and <8 x i32> %1, %mask
  ; bits of the first operand that survive (mask inverted by xor with -1)
  %inv_mask = xor <8 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1,
                                    i32 -1, i32 -1, i32 -1, i32 -1>
  %kept_bits = and <8 x i32> %0, %inv_mask
  %merged_bits = or <8 x i32> %kept_bits, %taken_bits
  ret <8 x i32> %merged_bits
}
|
||||
|
||||
; Float flavor of the blend: both operands are reinterpreted as <8 x i32>,
; blended by @__vselect_i32 under %mask, and the result is reinterpreted
; back to <8 x float>.
define <8 x float> @__vselect_float(<8 x float>, <8 x float>,
                                    <8 x i32> %mask) nounwind readnone alwaysinline {
  %first_bits = bitcast <8 x float> %0 to <8 x i32>
  %second_bits = bitcast <8 x float> %1 to <8 x i32>
  %blend_bits = call <8 x i32> @__vselect_i32(<8 x i32> %first_bits, <8 x i32> %second_bits,
                                              <8 x i32> %mask)
  %blend_val = bitcast <8 x i32> %blend_bits to <8 x float>
  ret <8 x float> %blend_val
}
|
||||
|
||||
|
||||
; To do vector integer min and max, we do the vector compare and then sign
|
||||
; extend the i1 vector result to an i32 mask. The __vselect does the
|
||||
; rest...
|
||||
|
||||
; Per-lane signed minimum: lanes where %0 < %1 take %0, all others take %1.
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %first_lt = icmp slt <8 x i32> %0, %1
  ; widen the i1 results to all-ones / all-zeros lanes
  %pick_first = sext <8 x i1> %first_lt to <8 x i32>
  %min_val = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %pick_first)
  ret <8 x i32> %min_val
}
|
||||
|
||||
; Signed minimum of two uniform 32-bit integers.
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %first_lt = icmp slt i32 %0, %1
  %min_val = select i1 %first_lt, i32 %0, i32 %1
  ret i32 %min_val
}
|
||||
|
||||
; Per-lane signed maximum: lanes where %0 > %1 take %0, all others take %1.
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %first_gt = icmp sgt <8 x i32> %0, %1
  ; widen the i1 results to all-ones / all-zeros lanes
  %pick_first = sext <8 x i1> %first_gt to <8 x i32>
  %max_val = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %pick_first)
  ret <8 x i32> %max_val
}
|
||||
|
||||
; Signed maximum of two uniform 32-bit integers.
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %first_gt = icmp sgt i32 %0, %1
  %max_val = select i1 %first_gt, i32 %0, i32 %1
  ret i32 %max_val
}
|
||||
|
||||
; The functions for unsigned ints are similar, just with unsigned
|
||||
; comparison functions...
|
||||
|
||||
; Per-lane unsigned minimum: lanes where %0 is below %1 in the unsigned
; ordering take %0, all others take %1.
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %first_lt = icmp ult <8 x i32> %0, %1
  ; widen the i1 results to all-ones / all-zeros lanes
  %pick_first = sext <8 x i1> %first_lt to <8 x i32>
  %min_val = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %pick_first)
  ret <8 x i32> %min_val
}
|
||||
|
||||
; Unsigned minimum of two uniform 32-bit integers.
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %first_lt = icmp ult i32 %0, %1
  %min_val = select i1 %first_lt, i32 %0, i32 %1
  ret i32 %min_val
}
|
||||
|
||||
; Per-lane unsigned maximum: lanes where %0 is above %1 in the unsigned
; ordering take %0, all others take %1.
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %first_gt = icmp ugt <8 x i32> %0, %1
  ; widen the i1 results to all-ones / all-zeros lanes
  %pick_first = sext <8 x i1> %first_gt to <8 x i32>
  %max_val = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %pick_first)
  ret <8 x i32> %max_val
}
|
||||
|
||||
; Unsigned maximum of two uniform 32-bit integers.
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %first_gt = icmp ugt i32 %0, %1
  %max_val = select i1 %first_gt, i32 %0, i32 %1
  ret i32 %max_val
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
; Combines the results of two 4-wide @llvm.x86.sse.movmsk.ps calls, one per
; half of the 8-wide mask, into a single i64. The bits for lanes 0-3 occupy
; the low 4 bits and the bits for lanes 4-7 sit directly above them.
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
  ; the intrinsic takes floats, so reinterpret the mask and split it in two
  %mask_as_float = bitcast <8 x i32> %0 to <8 x float>
  %lanes_lo = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %lanes_hi = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %bits_lo = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lanes_lo) nounwind readnone
  %bits_hi = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lanes_hi) nounwind readnone

  ; move the result for lanes 4-7 up by 4 bits before ORing it with the
  ; result for lanes 0-3
  %bits_hi_moved = shl i32 %bits_hi, 4
  %bits_all = or i32 %bits_lo, %bits_hi_moved
  %bits_all64 = zext i32 %bits_all to i64
  ret i64 %bits_all64
}
|
||||
|
||||
; Lane-wise sum of two 4-wide float vectors.
define <4 x float> @__vec4_add_float(<4 x float> %v0,
                                     <4 x float> %v1) nounwind readnone alwaysinline {
  %lane_sum = fadd <4 x float> %v0, %v1
  ret <4 x float> %lane_sum
}
|
||||
|
||||
; Sum of two scalar floats.
define float @__add_float(float, float) nounwind readnone alwaysinline {
  %scalar_sum = fadd float %0, %1
  ret float %scalar_sum
}
|
||||
|
||||
; Sum across the 8 float lanes of %0. The whole body, including the return,
; comes from an m4 helper that is handed the 4-wide @__vec4_add_float and
; the scalar @__add_float.
; NOTE(review): the order in which lanes are combined is decided by that
; helper and is not visible here.
define float @__reduce_add_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8by4(float, @__vec4_add_float, @__add_float)
|
||||
}
|
||||
|
||||
; Minimum across the 8 float lanes of %0. The whole body, including the
; return, comes from an m4 helper that is handed @__min_varying_float and
; @__min_uniform_float.
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
; Maximum across the 8 float lanes of %0. The whole body, including the
; return, comes from an m4 helper that is handed @__max_varying_float and
; @__max_uniform_float.
define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
; helper function for reduce_add_int32
|
||||
; Lane-wise sum of two 4-wide i32 vectors.
define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
                                   <4 x i32> %v1) nounwind readnone alwaysinline {
  %lane_sum = add <4 x i32> %v0, %v1
  ret <4 x i32> %lane_sum
}
|
||||
|
||||
; helper function for reduce_add_int32
|
||||
; Sum of two scalar 32-bit integers (plain add, so it wraps on overflow).
define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
  %scalar_sum = add i32 %0, %1
  ret i32 %scalar_sum
}
|
||||
|
||||
; Sum across the 8 i32 lanes of %0. The whole body, including the return,
; comes from an m4 helper that is handed the 4-wide @__vec4_add_int32 and
; the scalar @__add_int32.
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @__vec4_add_int32, @__add_int32)
|
||||
}
|
||||
|
||||
; Signed minimum across the 8 i32 lanes of %0. The whole body, including
; the return, comes from an m4 helper that is handed @__min_varying_int32
; and @__min_uniform_int32.
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
; Signed maximum across the 8 i32 lanes of %0. The whole body, including
; the return, comes from an m4 helper that is handed @__max_varying_int32
; and @__max_uniform_int32.
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
; Sum across the 8 lanes for unsigned values; simply forwards to the signed
; variant @__reduce_add_int32 and returns its result.
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
  %lane_total = call i32 @__reduce_add_int32(<8 x i32> %v)
  ret i32 %lane_total
}
|
||||
|
||||
; Unsigned minimum across the 8 i32 lanes of %0. The whole body, including
; the return, comes from an m4 helper that is handed @__min_varying_uint32
; and @__min_uniform_uint32.
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
; Unsigned maximum across the 8 i32 lanes of %0. The whole body, including
; the return, comes from an m4 helper that is handed @__max_varying_uint32
; and @__max_uniform_uint32.
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
; Lane-wise sum of two 4-wide double vectors.
define <4 x double> @__add_varying_double(<4 x double>,
                                          <4 x double>) nounwind readnone alwaysinline {
  %lane_sum = fadd <4 x double> %0, %1
  ret <4 x double> %lane_sum
}
|
||||
|
||||
; Sum of two scalar doubles.
define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
  %scalar_sum = fadd double %0, %1
  ret double %scalar_sum
}
|
||||
|
||||
; Sum across the 8 double lanes of %0. The whole body, including the
; return, comes from an m4 helper that is handed the 4-wide
; @__add_varying_double and the scalar @__add_uniform_double.
define double @__reduce_add_double(<8 x double>) nounwind readnone {
|
||||
reduce8by4(double, @__add_varying_double, @__add_uniform_double)
|
||||
}
|
||||
|
||||
; Minimum across the 8 double lanes of %0. The whole body, including the
; return, comes from an m4 helper that is handed @__min_varying_double and
; @__min_uniform_double.
define double @__reduce_min_double(<8 x double>) nounwind readnone {
|
||||
reduce8(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
; Maximum across the 8 double lanes of %0. The whole body, including the
; return, comes from an m4 helper that is handed @__max_varying_double and
; @__max_uniform_double.
define double @__reduce_max_double(<8 x double>) nounwind readnone {
|
||||
reduce8(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
; Lane-wise sum of two 4-wide i64 vectors.
define <4 x i64> @__add_varying_int64(<4 x i64>,
                                      <4 x i64>) nounwind readnone alwaysinline {
  %lane_sum = add <4 x i64> %0, %1
  ret <4 x i64> %lane_sum
}
|
||||
|
||||
; Sum of two scalar 64-bit integers (plain add, so it wraps on overflow).
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %scalar_sum = add i64 %0, %1
  ret i64 %scalar_sum
}
|
||||
|
||||
; Sum across the 8 i64 lanes of %0. The whole body, including the return,
; comes from an m4 helper that is handed the 4-wide @__add_varying_int64
; and the scalar @__add_uniform_int64.
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||
}
|
||||
|
||||
; Minimum across the 8 i64 lanes of %0. The whole body, including the
; return, comes from an m4 helper that is handed @__min_varying_int64 and
; @__min_uniform_int64, neither of which is defined in the visible part of
; this file.
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
; Maximum across the 8 i64 lanes of %0. The whole body, including the
; return, comes from an m4 helper that is handed @__max_varying_int64 and
; @__max_uniform_int64, neither of which is defined in the visible part of
; this file.
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
; Unsigned minimum across the 8 i64 lanes of %0. The whole body, including
; the return, comes from an m4 helper that is handed @__min_varying_uint64
; and @__min_uniform_uint64, neither of which is defined in the visible
; part of this file.
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
; Unsigned maximum across the 8 i64 lanes of %0. The whole body, including
; the return, comes from an m4 helper that is handed @__max_varying_uint64
; and @__max_uniform_uint64, neither of which is defined in the visible
; part of this file.
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
reduce_equal(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
gen_gather(i32)
|
||||
gen_gather(float)
|
||||
gen_gather(i64)
|
||||
gen_gather(double)
|
||||
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float rounding
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
;; There are not any rounding instructions in SSE2, so we have to emulate
|
||||
;; the functionality with multiple instructions...
|
||||
|
||||
; The code for __round_* is the result of compiling the following source
|
||||
; code.
|
||||
;
|
||||
; export float Round(float x) {
|
||||
; unsigned int sign = signbits(x);
|
||||
; unsigned int ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; x += 0x1.0p23f;
|
||||
; x -= 0x1.0p23f;
|
||||
; ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; return x;
|
||||
;}
|
||||
|
||||
; 8-wide version of the rounding trick in the reference code above: strip
; the sign bit of every lane, add and then subtract 8388608.0 (0x1.0p23f),
; and put the sign bit back.
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
  %in_bits = bitcast <8 x float> %0 to <8 x i32>
  ; keep only the sign bit (0x80000000) of each lane
  %sign_bit = and <8 x i32> %in_bits,
      <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
       i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ; xor with the sign bit clears it, leaving the bits of the magnitude
  %mag_bits = xor <8 x i32> %in_bits, %sign_bit
  %mag_val = bitcast <8 x i32> %mag_bits to <8 x float>
  %mag_plus = fadd <8 x float> %mag_val,
      <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
       float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
  %mag_rounded = fadd <8 x float> %mag_plus,
      <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
       float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
  %rounded_bits = bitcast <8 x float> %mag_rounded to <8 x i32>
  ; reapply the original sign
  %out_bits = xor <8 x i32> %rounded_bits, %sign_bit
  %out_val = bitcast <8 x i32> %out_bits to <8 x float>
  ret <8 x float> %out_val
}
|
||||
|
||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||
;; bitcode from compiling the following source code...
|
||||
|
||||
;export float Floor(float x) {
|
||||
; float y = Round(x);
|
||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
||||
; float delta = -1.f;
|
||||
; unsigned int idelta = intbits(delta);
|
||||
; idelta &= cmp;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
; Per-lane floor: round first, then add -1.0 in every lane whose rounded
; value ended up greater than the input.
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
  %rounded_val = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
  %went_up = fcmp ogt <8 x float> %rounded_val, %0
  ; all ones in lanes where the comparison holds, zero elsewhere
  %cmp_bits = sext <8 x i1> %went_up to <8 x i32>
  ; -1082130432 is 0xBF800000, the bit pattern of -1.0f
  %delta_bits = and <8 x i32> %cmp_bits,
      <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
       i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
  %delta_val = bitcast <8 x i32> %delta_bits to <8 x float>
  %floored_val = fadd <8 x float> %rounded_val, %delta_val
  ret <8 x float> %floored_val
}
|
||||
|
||||
;; And here is the code we compiled to get the __ceil* functions below
|
||||
;
|
||||
;export uniform float Ceil(uniform float x) {
|
||||
; uniform float y = Round(x);
|
||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
||||
; uniform float delta = 1.f;
|
||||
; uniform int idelta = intbits(delta);
|
||||
; idelta &= yltx;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
; Per-lane ceiling: round first, then add 1.0 in every lane whose rounded
; value ended up less than the input.
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
  %rounded_val = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
  %went_down = fcmp olt <8 x float> %rounded_val, %0
  ; all ones in lanes where the comparison holds, zero elsewhere
  %cmp_bits = sext <8 x i1> %went_down to <8 x i32>
  ; 1065353216 is 0x3F800000, the bit pattern of 1.0f
  %delta_bits = and <8 x i32> %cmp_bits,
      <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
       i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
  %delta_val = bitcast <8 x i32> %delta_bits to <8 x float>
  %ceiled_val = fadd <8 x float> %rounded_val, %delta_val
  ret <8 x float> %ceiled_val
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
; Rounds the 8 double lanes of %0. The whole body, including the return,
; comes from an m4 helper that is handed the scalar @round function.
; NOTE(review): presumably the helper calls @round once per lane - confirm
; in the shared m4 helpers.
define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
unary1to8(double, @round)
|
||||
}
|
||||
|
||||
; Floor of the 8 double lanes of %0. The whole body, including the return,
; comes from an m4 helper that is handed the scalar @floor function.
; NOTE(review): presumably the helper calls @floor once per lane - confirm
; in the shared m4 helpers.
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
unary1to8(double, @floor)
|
||||
}
|
||||
|
||||
; Ceiling of the 8 double lanes of %0. The whole body, including the
; return, comes from an m4 helper that is handed the scalar @ceil function.
; NOTE(review): presumably the helper calls @ceil once per lane - confirm
; in the shared m4 helpers.
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
unary1to8(double, @ceil)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
; Masked store done as load / blend / store: the 8 x i32 vector at %0 is
; loaded, blended with %1 under %mask by @__vselect_i32 (mask bits set take
; %1, the rest keep what was in memory), and written back.
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
                                      <8 x i32> %mask) nounwind alwaysinline {
  %mem_vals = load <8 x i32> * %0, align 4
  %blended_vals = call <8 x i32> @__vselect_i32(<8 x i32> %mem_vals, <8 x i32> %1,
                                                <8 x i32> %mask)
  store <8 x i32> %blended_vals, <8 x i32> * %0, align 4
  ret void
}
|
||||
|
||||
; Masked store of 8 x i64 done as load / blend / store. The vector at %ptr
; is loaded, each 4-element half is blended with the matching half of %new
; through @__vselect_float, and the recombined result is stored back. Each
; mask lane is duplicated so that it covers both 32-bit halves of its
; 64-bit element.
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
%oldValue = load <8 x i64>* %ptr, align 8
|
||||
|
||||
; Do 8x64-bit blends by doing two <8 x float> blends, where the <8 x float>
|
||||
; values are actually bitcast <4 x i64> values
|
||||
;
|
||||
; set up the first four 64-bit values
|
||||
%old0123 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%old0123f = bitcast <4 x i64> %old0123 to <8 x float>
|
||||
%new0123 = shufflevector <8 x i64> %new, <8 x i64> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%new0123f = bitcast <4 x i64> %new0123 to <8 x float>
|
||||
; compute mask--note that the indices are doubled-up
|
||||
%mask0123 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
; and blend the first 4 values
|
||||
%result0123f = call <8 x float> @__vselect_float(<8 x float> %old0123f, <8 x float> %new0123f,
|
||||
<8 x i32> %mask0123)
|
||||
%result0123 = bitcast <8 x float> %result0123f to <4 x i64>
|
||||
|
||||
; and again, this time for 64-bit values 4 through 7
|
||||
%old4567 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%old4567f = bitcast <4 x i64> %old4567 to <8 x float>
|
||||
%new4567 = shufflevector <8 x i64> %new, <8 x i64> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%new4567f = bitcast <4 x i64> %new4567 to <8 x float>
|
||||
; compute mask--note that the indices are doubled-up
|
||||
%mask4567 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
||||
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
|
||||
; and blend the last 4 values
|
||||
%result4567f = call <8 x float> @__vselect_float(<8 x float> %old4567f, <8 x float> %new4567f,
|
||||
<8 x i32> %mask4567)
|
||||
%result4567 = bitcast <8 x float> %result4567f to <4 x i64>
|
||||
|
||||
; reconstruct the final <8 x i64> vector
|
||||
%final = shufflevector <4 x i64> %result0123, <4 x i64> %result4567,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
store <8 x i64> %final, <8 x i64> * %ptr, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
|
||||
; Per-lane square root of an 8-wide double vector. The body is an m4 helper
; expansion around the 2-wide @llvm.x86.sse2.sqrt.pd intrinsic applied to
; %0; the value returned is the %ret produced by that expansion.
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision float min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
; Per-lane minimum of two 8-wide double vectors. The body is an m4 helper
; expansion around the 2-wide @llvm.x86.sse2.min.pd intrinsic applied to %0
; and %1; the value returned is the %ret produced by that expansion.
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
; Per-lane maximum of two 8-wide double vectors. The body is an m4 helper
; expansion around the 2-wide @llvm.x86.sse2.max.pd intrinsic applied to %0
; and %1; the value returned is the %ret produced by that expansion.
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
590
builtins/target-sse2.ll
Normal file
590
builtins/target-sse2.ll
Normal file
@@ -0,0 +1,590 @@
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Define the standard library builtins for the SSE2 target
|
||||
|
||||
; Define some basics for a 4-wide target
|
||||
define(`WIDTH',`4')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
|
||||
include(`target-sse2-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
;; SSE2 has no dedicated rounding instructions, so we have to emulate
|
||||
;; the functionality with multiple instructions...
|
||||
|
||||
; The code for __round_* is the result of compiling the following source
|
||||
; code.
|
||||
;
|
||||
; export float Round(float x) {
|
||||
; unsigned int sign = signbits(x);
|
||||
; unsigned int ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; x += 0x1.0p23f;
|
||||
; x -= 0x1.0p23f;
|
||||
; ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; return x;
|
||||
;}
|
||||
|
||||
;; Round each lane of a <4 x float> to the nearest integral value.
;;
;; The magnitude of each lane is rounded by adding and then subtracting
;; 2^23 (8388608.0); the additions themselves perform the rounding, using
;; whatever rounding mode is currently active.  The sign bit is stripped
;; first and restored afterwards so that negative inputs round
;; symmetrically.
;;
;; The add/subtract trick is only exact while the magnitude is below 2^23:
;; for odd integers in [2^23, 2^24) the intermediate sum is not
;; representable and the result comes out off by one (8388609.0 would turn
;; into 8388608.0).  Every float with magnitude >= 2^23 is already
;; integral, so those lanes (and NaN lanes, for which the comparison below
;; is false) are passed through unchanged.  This guard is an addition to
;; the compiled source listing shown above this function.
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; split the input into sign bit and magnitude
  %x_bits = bitcast <4 x float> %0 to <4 x i32>
  %sign_bits = and <4 x i32> %x_bits, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %abs_bits = xor <4 x i32> %x_bits, %sign_bits
  %abs_x = bitcast <4 x i32> %abs_bits to <4 x float>

  ; round the magnitude: (abs + 2^23) - 2^23
  %biased = fadd <4 x float> %abs_x, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
  %abs_rounded = fadd <4 x float> %biased, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>

  ; put the original sign back
  %abs_rounded_bits = bitcast <4 x float> %abs_rounded to <4 x i32>
  %rounded_bits = xor <4 x i32> %abs_rounded_bits, %sign_bits

  ; keep the rounded value only where the magnitude is below 2^23; SSE2
  ; has no blend instruction, so the selection is done with bit operations
  %is_small = fcmp olt <4 x float> %abs_x, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
  %small_mask = sext <4 x i1> %is_small to <4 x i32>
  %large_mask = xor <4 x i32> %small_mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %from_round = and <4 x i32> %rounded_bits, %small_mask
  %from_input = and <4 x i32> %x_bits, %large_mask
  %result_bits = or <4 x i32> %from_round, %from_input
  %result = bitcast <4 x i32> %result_bits to <4 x float>
  ret <4 x float> %result
}
|
||||
|
||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||
;; bitcode from compiling the following source code...
|
||||
|
||||
;export float Floor(float x) {
|
||||
; float y = Round(x);
|
||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
||||
; float delta = -1.f;
|
||||
; unsigned int idelta = intbits(delta);
|
||||
; idelta &= cmp;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
;; Per-lane floor of a <4 x float>: round to nearest, then subtract 1.0 in
;; every lane where the rounded value ended up above the input.
define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %nearest = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
  ; all-ones in the lanes where rounding overshot the input
  %overshot = fcmp ogt <4 x float> %nearest, %0
  %overshot_mask = sext <4 x i1> %overshot to <4 x i32>
  ; -1082130432 is the bit pattern of -1.0f (0xBF800000); masked-off lanes
  ; get +0.0 instead
  %delta_bits = and <4 x i32> %overshot_mask, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
  %delta = bitcast <4 x i32> %delta_bits to <4 x float>
  %floored = fadd <4 x float> %nearest, %delta
  ret <4 x float> %floored
}
|
||||
|
||||
;; And here is the code we compiled to get the __ceil* functions below
|
||||
;
|
||||
;export uniform float Ceil(uniform float x) {
|
||||
; uniform float y = Round(x);
|
||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
||||
; uniform float delta = 1.f;
|
||||
; uniform int idelta = intbits(delta);
|
||||
; idelta &= yltx;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
;; Per-lane ceiling of a <4 x float>: round to nearest, then add 1.0 in
;; every lane where the rounded value ended up below the input.
define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %nearest = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
  ; all-ones in the lanes where rounding undershot the input
  %undershot = fcmp olt <4 x float> %nearest, %0
  %undershot_mask = sext <4 x i1> %undershot to <4 x i32>
  ; 1065353216 is the bit pattern of 1.0f (0x3F800000); masked-off lanes
  ; get +0.0 instead
  %delta_bits = and <4 x i32> %undershot_mask, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
  %delta = bitcast <4 x i32> %delta_bits to <4 x float>
  %ceiled = fadd <4 x float> %nearest, %delta
  ret <4 x float> %ceiled
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
;; Rounding for <4 x double> values.  The whole body, including the return,
;; is produced by the m4 helper invoked below, which is handed the scalar
;; @round function.
;; NOTE(review): presumably @round is called once per lane; confirm against
;; the helper, which is defined outside this chunk.
define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @round)
|
||||
}
|
||||
|
||||
;; Floor for <4 x double> values.  The whole body, including the return, is
;; produced by the m4 helper invoked below, which is handed the scalar
;; @floor function.
;; NOTE(review): presumably @floor is called once per lane; confirm against
;; the helper, which is defined outside this chunk.
define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @floor)
|
||||
}
|
||||
|
||||
;; Ceiling for <4 x double> values.  The whole body, including the return,
;; is produced by the m4 helper invoked below, which is handed the scalar
;; @ceil function.
;; NOTE(review): presumably @ceil is called once per lane; confirm against
;; the helper, which is defined outside this chunk.
define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @ceil)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; min/max
|
||||
|
||||
; There is no blend instruction with SSE2, so we simulate it with bit
|
||||
; operations on i32s. For these two vselect functions, for each
|
||||
; vector element, if the mask is on, we return the corresponding value
|
||||
; from %1, and otherwise return the value from %0.
|
||||
|
||||
;; Bitwise select between two <4 x i32> values: result = (%0 & ~mask) |
;; (%1 & mask).  With an all-ones / all-zeros mask per lane this returns
;; the lane from %1 where the mask is on and the lane from %0 elsewhere.
define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32>,
                                <4 x i32> %mask) nounwind readnone alwaysinline {
  ; bits taken from the second operand
  %from_second = and <4 x i32> %1, %mask
  ; bits kept from the first operand
  %inverted = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %from_first = and <4 x i32> %0, %inverted
  %merged = or <4 x i32> %from_first, %from_second
  ret <4 x i32> %merged
}
|
||||
|
||||
;; Float flavour of the bitwise select: both operands are reinterpreted as
;; <4 x i32>, handed to @__vselect_i32 together with the mask, and the
;; result is reinterpreted as <4 x float> again.
define <4 x float> @__vselect_float(<4 x float>, <4 x float>,
                                    <4 x i32> %mask) nounwind readnone alwaysinline {
  %first_bits = bitcast <4 x float> %0 to <4 x i32>
  %second_bits = bitcast <4 x float> %1 to <4 x i32>
  %picked_bits = call <4 x i32> @__vselect_i32(<4 x i32> %first_bits, <4 x i32> %second_bits, <4 x i32> %mask)
  %picked = bitcast <4 x i32> %picked_bits to <4 x float>
  ret <4 x float> %picked
}
|
||||
|
||||
|
||||
; To do vector integer min and max, we do the vector compare and then sign
|
||||
; extend the i1 vector result to an i32 mask. The __vselect does the
|
||||
; rest...
|
||||
|
||||
;; Per-lane signed minimum of two <4 x i32> values.
define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  ; all-ones in the lanes where the first operand is the smaller one
  %first_less = icmp slt <4 x i32> %0, %1
  %take_first = sext <4 x i1> %first_less to <4 x i32>
  ; start from the second operand and overwrite the flagged lanes
  %lowest = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %take_first)
  ret <4 x i32> %lowest
}
|
||||
|
||||
;; Signed minimum of two scalar i32 values.
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %first_less = icmp slt i32 %0, %1
  %lowest = select i1 %first_less, i32 %0, i32 %1
  ret i32 %lowest
}
|
||||
|
||||
;; Per-lane signed maximum of two <4 x i32> values.
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  ; all-ones in the lanes where the first operand is the larger one
  %first_greater = icmp sgt <4 x i32> %0, %1
  %take_first = sext <4 x i1> %first_greater to <4 x i32>
  ; start from the second operand and overwrite the flagged lanes
  %highest = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %take_first)
  ret <4 x i32> %highest
}
|
||||
|
||||
;; Signed maximum of two scalar i32 values.
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %first_greater = icmp sgt i32 %0, %1
  %highest = select i1 %first_greater, i32 %0, i32 %1
  ret i32 %highest
}
|
||||
|
||||
; The functions for unsigned ints are similar, just with unsigned
|
||||
; comparison functions...
|
||||
|
||||
;; Per-lane unsigned minimum of two <4 x i32> values.
define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  ; all-ones in the lanes where the first operand is below the second
  %first_below = icmp ult <4 x i32> %0, %1
  %take_first = sext <4 x i1> %first_below to <4 x i32>
  ; start from the second operand and overwrite the flagged lanes
  %lowest = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %take_first)
  ret <4 x i32> %lowest
}
|
||||
|
||||
;; Unsigned minimum of two scalar i32 values.
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %first_below = icmp ult i32 %0, %1
  %lowest = select i1 %first_below, i32 %0, i32 %1
  ret i32 %lowest
}
|
||||
|
||||
;; Per-lane unsigned maximum of two <4 x i32> values.
define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  ; all-ones in the lanes where the first operand is above the second
  %first_above = icmp ugt <4 x i32> %0, %1
  %take_first = sext <4 x i1> %first_above to <4 x i32>
  ; start from the second operand and overwrite the flagged lanes
  %highest = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %take_first)
  ret <4 x i32> %highest
}
|
||||
|
||||
;; Unsigned maximum of two scalar i32 values.
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %first_above = icmp ugt i32 %0, %1
  %highest = select i1 %first_above, i32 %0, i32 %1
  ret i32 %highest
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
;; Pack a <4 x i32> lane mask into an integer by feeding it, reinterpreted
;; as floats, to the SSE movmsk.ps intrinsic; the i32 result is
;; zero-extended to i64.
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
  %as_float = bitcast <4 x i32> %0 to <4 x float>
  %bits32 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %as_float) nounwind readnone
  %bits64 = zext i32 %bits32 to i64
  ret i64 %bits64
}
|
||||
|
||||
;; Horizontal sum of the four float lanes, computed as (v2+v0) + (v3+v1).
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
  ; bring lanes 2 and 3 down to positions 0 and 1 (upper lanes are unused)
  %upper = shufflevector <4 x float> %v, <4 x float> undef,
                         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %pairs = fadd <4 x float> %upper, %v
  ; add the two partial sums
  %pair0 = extractelement <4 x float> %pairs, i32 0
  %pair1 = extractelement <4 x float> %pairs, i32 1
  %total = fadd float %pair0, %pair1
  ret float %total
}
|
||||
|
||||
;; Reduce a <4 x float> to a single float minimum.  The whole body is
;; produced by the m4 reduction helper invoked below, which is handed the
;; vector and scalar float min functions.
;; NOTE(review): the helper is defined outside this chunk; confirm there
;; how the lanes are combined.
define float @__reduce_min_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
;; Reduce a <4 x float> to a single float maximum.  The whole body is
;; produced by the m4 reduction helper invoked below, which is handed the
;; vector and scalar float max functions.
;; NOTE(review): the helper is defined outside this chunk; confirm there
;; how the lanes are combined.
define float @__reduce_max_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
;; Horizontal sum of the four i32 lanes, computed as (v2+v0) + (v3+v1).
define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
  ; bring lanes 2 and 3 down to positions 0 and 1 (upper lanes are unused)
  %upper = shufflevector <4 x i32> %v, <4 x i32> undef,
                         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %pairs = add <4 x i32> %upper, %v
  ; add the two partial sums
  %pair0 = extractelement <4 x i32> %pairs, i32 0
  %pair1 = extractelement <4 x i32> %pairs, i32 1
  %total = add i32 %pair0, %pair1
  ret i32 %total
}
|
||||
|
||||
;; Reduce a <4 x i32> to a single signed minimum.  The whole body is
;; produced by the m4 reduction helper invoked below, which is handed the
;; vector and scalar signed int32 min functions.
;; NOTE(review): the helper is defined outside this chunk; confirm there
;; how the lanes are combined.
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
;; Reduce a <4 x i32> to a single signed maximum.  The whole body is
;; produced by the m4 reduction helper invoked below, which is handed the
;; vector and scalar signed int32 max functions.
;; NOTE(review): the helper is defined outside this chunk; confirm there
;; how the lanes are combined.
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
;; Horizontal sum of four unsigned i32 lanes.  Integer addition is the same
;; operation for signed and unsigned values, so this simply forwards to the
;; signed reduction.
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
  %total = call i32 @__reduce_add_int32(<4 x i32> %v)
  ret i32 %total
}
|
||||
|
||||
;; Reduce a <4 x i32> to a single unsigned minimum.  The whole body is
;; produced by the m4 reduction helper invoked below, which is handed the
;; vector and scalar unsigned int32 min functions.
;; NOTE(review): the helper is defined outside this chunk; confirm there
;; how the lanes are combined.
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
;; Reduce a <4 x i32> to a single unsigned maximum.  The whole body is
;; produced by the m4 reduction helper invoked below, which is handed the
;; vector and scalar unsigned int32 max functions.
;; NOTE(review): the helper is defined outside this chunk; confirm there
;; how the lanes are combined.
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
;; Horizontal sum of the four double lanes, computed as (v0+v2) + (v1+v3).
define double @__reduce_add_double(<4 x double>) nounwind readnone {
  ; split into the lower and upper 2-wide halves
  %lower = shufflevector <4 x double> %0, <4 x double> undef,
                         <2 x i32> <i32 0, i32 1>
  %upper = shufflevector <4 x double> %0, <4 x double> undef,
                         <2 x i32> <i32 2, i32 3>
  %pairs = fadd <2 x double> %lower, %upper
  ; add the two partial sums
  %pair0 = extractelement <2 x double> %pairs, i32 0
  %pair1 = extractelement <2 x double> %pairs, i32 1
  %total = fadd double %pair0, %pair1
  ret double %total
}
|
||||
|
||||
;; Reduce a <4 x double> to a single double minimum.  The whole body is
;; produced by the m4 reduction helper invoked below, which is handed the
;; vector and scalar double min functions.
;; NOTE(review): the helper is defined outside this chunk; confirm there
;; how the lanes are combined.
define double @__reduce_min_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
;; Reduce a <4 x double> to a single double maximum.  The whole body is
;; produced by the m4 reduction helper invoked below, which is handed the
;; vector and scalar double max functions.
;; NOTE(review): the helper is defined outside this chunk; confirm there
;; how the lanes are combined.
define double @__reduce_max_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
;; Horizontal sum of the four i64 lanes, computed as (v0+v2) + (v1+v3).
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
  ; split into the lower and upper 2-wide halves
  %lower = shufflevector <4 x i64> %0, <4 x i64> undef,
                         <2 x i32> <i32 0, i32 1>
  %upper = shufflevector <4 x i64> %0, <4 x i64> undef,
                         <2 x i32> <i32 2, i32 3>
  %pairs = add <2 x i64> %lower, %upper
  ; add the two partial sums
  %pair0 = extractelement <2 x i64> %pairs, i32 0
  %pair1 = extractelement <2 x i64> %pairs, i32 1
  %total = add i64 %pair0, %pair1
  ret i64 %total
}
|
||||
|
||||
;; Reduce a <4 x i64> to a single signed minimum.  The whole body is
;; produced by the m4 reduction helper invoked below, which is handed the
;; vector and scalar signed int64 min functions (not defined in this
;; chunk).
;; NOTE(review): confirm in the helper how the lanes are combined.
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
;; Reduce a <4 x i64> to a single signed maximum.  The whole body is
;; produced by the m4 reduction helper invoked below, which is handed the
;; vector and scalar signed int64 max functions (not defined in this
;; chunk).
;; NOTE(review): confirm in the helper how the lanes are combined.
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
;; Reduce a <4 x i64> to a single unsigned minimum.  The whole body is
;; produced by the m4 reduction helper invoked below, which is handed the
;; vector and scalar unsigned int64 min functions (not defined in this
;; chunk).
;; NOTE(review): confirm in the helper how the lanes are combined.
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
;; Reduce a <4 x i64> to a single unsigned maximum.  The whole body is
;; produced by the m4 reduction helper invoked below, which is handed the
;; vector and scalar unsigned int64 max functions (not defined in this
;; chunk).
;; NOTE(review): confirm in the helper how the lanes are combined.
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
reduce_equal(4)
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
;; Masked store of a <4 x i32>, done as load / blend / store: the current
;; memory contents are read, lanes with the mask on are replaced by the new
;; value through @__vselect_i32, and the full vector is written back.
;; Note that all four lanes of memory are read and written regardless of
;; the mask.
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
                                      <4 x i32> %mask) nounwind alwaysinline {
  %current = load <4 x i32> * %0, align 4
  %blended = call <4 x i32> @__vselect_i32(<4 x i32> %current, <4 x i32> %1, <4 x i32> %mask)
  store <4 x i32> %blended, <4 x i32> * %0, align 4
  ret void
}
|
||||
|
||||
;; Masked store of a <4 x i64>, done as load / blend / store.
;;
;; The 4x64-bit blend is carried out as two 4x32-bit blends: each pair of
;; 64-bit lanes is reinterpreted as <4 x float>, and the matching pair of
;; mask lanes is duplicated (0,0,1,1 and 2,2,3,3) so that both 32-bit
;; halves of a 64-bit lane see the same mask value.  All four lanes of
;; memory are read and written regardless of the mask.
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                      <4 x i32> %mask) nounwind alwaysinline {
  %current = load <4 x i64>* %ptr, align 8

  ; ---- 64-bit lanes 0 and 1 ----
  %cur_lo = shufflevector <4 x i64> %current, <4 x i64> undef,
                          <2 x i32> <i32 0, i32 1>
  %cur_lo_f = bitcast <2 x i64> %cur_lo to <4 x float>
  %new_lo = shufflevector <4 x i64> %new, <4 x i64> undef,
                          <2 x i32> <i32 0, i32 1>
  %new_lo_f = bitcast <2 x i64> %new_lo to <4 x float>
  ; mask lanes 0 and 1, each repeated twice
  %mask_lo = shufflevector <4 x i32> %mask, <4 x i32> undef,
                           <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  %blend_lo_f = call <4 x float> @__vselect_float(<4 x float> %cur_lo_f, <4 x float> %new_lo_f, <4 x i32> %mask_lo)
  %blend_lo = bitcast <4 x float> %blend_lo_f to <2 x i64>

  ; ---- 64-bit lanes 2 and 3 ----
  %cur_hi = shufflevector <4 x i64> %current, <4 x i64> undef,
                          <2 x i32> <i32 2, i32 3>
  %cur_hi_f = bitcast <2 x i64> %cur_hi to <4 x float>
  %new_hi = shufflevector <4 x i64> %new, <4 x i64> undef,
                          <2 x i32> <i32 2, i32 3>
  %new_hi_f = bitcast <2 x i64> %new_hi to <4 x float>
  ; mask lanes 2 and 3, each repeated twice
  %mask_hi = shufflevector <4 x i32> %mask, <4 x i32> undef,
                           <4 x i32> <i32 2, i32 2, i32 3, i32 3>
  %blend_hi_f = call <4 x float> @__vselect_float(<4 x float> %cur_hi_f, <4 x float> %new_hi_f, <4 x i32> %mask_hi)
  %blend_hi = bitcast <4 x float> %blend_hi_f to <2 x i64>

  ; stitch the two halves back into one <4 x i64> and write it out
  %merged = shufflevector <2 x i64> %blend_lo, <2 x i64> %blend_hi,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %merged, <4 x i64> * %ptr, align 8
  ret void
}
|
||||
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
|
||||
;; Per-lane reciprocal of a <4 x float>: start from the rcp.ps estimate and
;; refine it with one Newton-Raphson step, r = e * (2 - x * e).
define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; initial estimate e
  %estimate = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
  ; x * e
  %x_est = fmul <4 x float> %0, %estimate
  ; 2 - x * e
  %correction = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %x_est
  ; e * (2 - x * e)
  %refined = fmul <4 x float> %estimate, %correction
  ret <4 x float> %refined
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
;; Per-lane reciprocal square root of a <4 x float>: start from the
;; rsqrt.ps estimate and refine it with one Newton-Raphson step,
;; r = 0.5 * (e * (3 - (v * e) * e)).
define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
  ; initial estimate e
  %estimate = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
  ; (v * e) * e
  %v_est = fmul <4 x float> %v, %estimate
  %v_est_sq = fmul <4 x float> %v_est, %estimate
  ; 3 - (v * e) * e
  %correction = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_est_sq
  ; 0.5 * (e * correction)
  %scaled = fmul <4 x float> %estimate, %correction
  %refined = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %scaled
  ret <4 x float> %refined
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
;; Per-lane square root of a <4 x float>, straight from the sqrt.ps
;; intrinsic.
define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %root = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
  ret <4 x float> %root
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
;; External 4-wide SVML entry points.  All of them are pure functions of
;; their vector arguments except sincosf4, which takes an output pointer.
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
; sincosf4 returns one result and is handed a pointer for the other, so it
; accesses memory and must not carry the readnone attribute.
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone

;; Per-lane sine through SVML.
define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
  ret <4 x float> %ret
}

;; Per-lane cosine through SVML.
define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
  ret <4 x float> %ret
}

;; Combined sine/cosine through SVML.  %0 is the input; the value returned
;; by sincosf4 is stored through %1, and %2 is passed on for sincosf4 to
;; fill in.  (Presumably %1 receives the sine and %2 the cosine; confirm
;; against the SVML documentation.)
;;
;; This function returns void and communicates only through memory, so it
;; must not be marked readnone: with that attribute the optimizer is free
;; to treat every call to it as dead and remove it.
define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind alwaysinline {
  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
  store <4 x float> %s, <4 x float> * %1
  ret void
}
|
||||
|
||||
;; Per-lane tangent: thin wrapper that forwards to the 4-wide SVML routine.
define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
  %tangent = call <4 x float> @__svml_tanf4(<4 x float> %0)
  ret <4 x float> %tangent
}
|
||||
|
||||
;; Per-lane arctangent: thin wrapper that forwards to the 4-wide SVML
;; routine.
define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
  %angle = call <4 x float> @__svml_atanf4(<4 x float> %0)
  ret <4 x float> %angle
}
|
||||
|
||||
;; Per-lane two-argument arctangent: thin wrapper that forwards both
;; operands, in order, to the 4-wide SVML routine.
define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
  %angle = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %angle
}
|
||||
|
||||
;; Per-lane exponential: thin wrapper that forwards to the 4-wide SVML
;; routine.
define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
  %expval = call <4 x float> @__svml_expf4(<4 x float> %0)
  ret <4 x float> %expval
}
|
||||
|
||||
;; Per-lane logarithm: thin wrapper that forwards to the 4-wide SVML
;; routine.
define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
  %logval = call <4 x float> @__svml_logf4(<4 x float> %0)
  ret <4 x float> %logval
}
|
||||
|
||||
;; Per-lane power function: thin wrapper that forwards both operands, in
;; order, to the 4-wide SVML routine.
define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
  %power = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %power
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
;; Per-lane float maximum, straight from the SSE max.ps intrinsic (operand
;; order is preserved).
define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
  %highest = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %highest
}
|
||||
|
||||
;; Per-lane float minimum, straight from the SSE min.ps intrinsic (operand
;; order is preserved).
define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
  %lowest = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %lowest
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
|
||||
;; Double-precision square root of a <4 x double> value.  The body is
;; produced by the m4 helper invoked below, which is handed the 2-wide SSE2
;; sqrt.pd intrinsic and the input %0, and binds its output to %ret.
;; NOTE(review): the helper is defined outside this chunk; presumably it
;; applies the intrinsic to each 2-element half -- confirm there.
define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
|
||||
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
;; Minimum of two <4 x double> values.  The body is produced by the m4
;; helper invoked below, which is handed the 2-wide SSE2 min.pd intrinsic
;; and both inputs, and binds its output to %ret.
;; NOTE(review): presumably the intrinsic is applied to each 2-element
;; half; confirm against the helper, which is defined outside this chunk.
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
;; Maximum of two <4 x double> values.  The body is produced by the m4
;; helper invoked below, which is handed the 2-wide SSE2 max.pd intrinsic
;; and both inputs, and binds its output to %ret.
;; NOTE(review): presumably the intrinsic is applied to each 2-element
;; half; confirm against the helper, which is defined outside this chunk.
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
masked_store_blend_8_16_by_4()
|
||||
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
gen_gather(i32)
|
||||
gen_gather(float)
|
||||
gen_gather(i64)
|
||||
gen_gather(double)
|
||||
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
276
builtins/target-sse4-common.ll
Normal file
276
builtins/target-sse4-common.ll
Normal file
@@ -0,0 +1,276 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
ctlztz()
|
||||
define_prefetches()
|
||||
define_shuffles()
|
||||
aossoa()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
;; Round a scalar float to the nearest integral value with the SSE4.1
;; roundss instruction.
;;
;; The intrinsic takes two vectors (a, b) and an immediate: lane 0 of the
;; result is the rounded lane 0 of b, and lanes 1..3 are copied from a.
;; Only lane 0 is used here, so the same vector is passed for both
;; operands and its upper lanes are left undefined.
;;
;; Immediate 8 = 0b1000 (do not signal precision exceptions) | 0b00 (round
;; to nearest).
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
  %in_vec = insertelement <4 x float> undef, float %0, i32 0
  %out_vec = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %in_vec, <4 x float> %in_vec, i32 8)
  %rounded = extractelement <4 x float> %out_vec, i32 0
  ret float %rounded
}
|
||||
|
||||
;; Floor of a scalar float with the SSE4.1 roundss instruction.  Only lane
;; 0 of the result is used, so the same vector (upper lanes undefined) is
;; passed for both operands of the intrinsic.
;;
;; Immediate 9 = 0b1000 (do not signal precision exceptions) | 0b01 (round
;; down).
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  %in_vec = insertelement <4 x float> undef, float %0, i32 0
  %out_vec = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %in_vec, <4 x float> %in_vec, i32 9)
  %floored = extractelement <4 x float> %out_vec, i32 0
  ret float %floored
}
|
||||
|
||||
;; Ceiling of a scalar float with the SSE4.1 roundss instruction.  Only
;; lane 0 of the result is used, so the same vector (upper lanes undefined)
;; is passed for both operands of the intrinsic.
;;
;; Immediate 10 = 0b1000 (do not signal precision exceptions) | 0b10 (round
;; up).
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  %in_vec = insertelement <4 x float> undef, float %0, i32 0
  %out_vec = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %in_vec, <4 x float> %in_vec, i32 10)
  %ceiled = extractelement <4 x float> %out_vec, i32 0
  ret float %ceiled
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||
|
||||
;; Round a scalar double to the nearest integral value with the SSE4.1
;; roundsd instruction.  Only lane 0 of the result is used, so the same
;; vector (upper lane undefined) is passed for both operands.
;;
;; Immediate 8 = 0b1000 (do not signal precision exceptions) | 0b00 (round
;; to nearest).
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
  %in_vec = insertelement <2 x double> undef, double %0, i32 0
  %out_vec = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %in_vec, <2 x double> %in_vec, i32 8)
  %rounded = extractelement <2 x double> %out_vec, i32 0
  ret double %rounded
}
|
||||
|
||||
;; Floor of a scalar double with the SSE4.1 roundsd instruction.  Only lane
;; 0 of the result is used, so the same vector (upper lane undefined) is
;; passed for both operands.
;;
;; Immediate 9 = 0b1000 (do not signal precision exceptions) | 0b01 (round
;; down).
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  %in_vec = insertelement <2 x double> undef, double %0, i32 0
  %out_vec = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %in_vec, <2 x double> %in_vec, i32 9)
  %floored = extractelement <2 x double> %out_vec, i32 0
  ret double %floored
}
|
||||
|
||||
;; Ceiling of a scalar double with the SSE4.1 roundsd instruction.  Only
;; lane 0 of the result is used, so the same vector (upper lane undefined)
;; is passed for both operands.
;;
;; Immediate 10 = 0b1000 (do not signal precision exceptions) | 0b10 (round
;; up).
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  %in_vec = insertelement <2 x double> undef, double %0, i32 0
  %out_vec = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %in_vec, <2 x double> %in_vec, i32 10)
  %ceiled = extractelement <2 x double> %out_vec, i32 0
  ret double %ceiled
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
;; Scalar float reciprocal: take the rcpss estimate from lane 0 and refine
;; it with one Newton-Raphson step, r = e * (2 - x * e).
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
  ; rcpss works on lane 0 of a vector, so wrap the scalar and unwrap the
  ; estimate afterwards
  %in_vec = insertelement <4 x float> undef, float %0, i32 0
  %est_vec = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %in_vec)
  %estimate = extractelement <4 x float> %est_vec, i32 0

  ; x * e
  %x_est = fmul float %0, %estimate
  ; 2 - x * e
  %correction = fsub float 2., %x_est
  ; e * (2 - x * e)
  %refined = fmul float %estimate, %correction
  ret float %refined
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
;; Scalar float reciprocal square root: take the rsqrtss estimate from lane
;; 0 and refine it with one Newton-Raphson step,
;; r = 0.5 * (e * (3 - (x * e) * e)).
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  ; rsqrtss works on lane 0 of a vector, so wrap the scalar and unwrap the
  ; estimate afterwards
  %in_vec = insertelement <4 x float> undef, float %0, i32 0
  %est_vec = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %in_vec)
  %estimate = extractelement <4 x float> %est_vec, i32 0

  ; (x * e) * e
  %x_est = fmul float %0, %estimate
  %x_est_sq = fmul float %x_est, %estimate
  ; 3 - (x * e) * e
  %correction = fsub float 3., %x_est_sq
  ; 0.5 * (e * correction)
  %scaled = fmul float %estimate, %correction
  %refined = fmul float 0.5, %scaled
  ret float %refined
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
; Square root of a single (uniform) float via the sse.sqrt.ss intrinsic.
; NOTE(review): sse_unary_scalar is an m4 macro defined outside this chunk; it
; is expected to bind %ret to the scalar result of applying the named
; <4 x float> intrinsic to %0 -- confirm against the macro definition.
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fast math mode
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
; Enables "fast math" by setting two bits in the SSE control/status word.
; The current value is written to a stack slot with the stmxcsr intrinsic,
; OR-ed with 32832, and loaded back with the ldmxcsr intrinsic. Existing bits
; are preserved; nothing restores the previous value afterwards.
define void @__fastmath() nounwind alwaysinline {
|
||||
; i32 scratch slot; the intrinsics take an i8 *, hence the bitcast
%ptr = alloca i32
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
; Maximum of two uniform floats via the sse.max.ss intrinsic.
; NOTE(review): sse_binary_scalar is an m4 macro defined outside this chunk; it
; is expected to bind %ret to the scalar result of applying the named
; <4 x float> intrinsic to %0 and %1 -- confirm against the macro definition.
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
; Minimum of two uniform floats via the sse.min.ss intrinsic.
; NOTE(review): sse_binary_scalar is an m4 macro defined outside this chunk; it
; is expected to bind %ret to the scalar result of applying the named
; <4 x float> intrinsic to %0 and %1 -- confirm against the macro definition.
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
; Square root of a single (uniform) double via the sse2.sqrt.sd intrinsic.
; NOTE(review): sse_unary_scalar is an m4 macro defined outside this chunk; it
; is expected to bind %ret to the scalar result of applying the named
; <2 x double> intrinsic to %0 -- confirm against the macro definition.
; NOTE(review): unlike __sqrt_uniform_float, this definition carries no
; readonly attribute -- confirm whether that difference is intentional.
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
; Minimum of two uniform doubles via the sse2.min.sd intrinsic.
; NOTE(review): sse_binary_scalar is an m4 macro defined outside this chunk; it
; is expected to bind %ret to the scalar result of applying the named
; <2 x double> intrinsic to %0 and %1 -- confirm against the macro definition.
; NOTE(review): unlike the float/int32/uint32 min/max helpers in this file,
; this definition has no alwaysinline attribute -- confirm that is intended.
define double @__min_uniform_double(double, double) nounwind readnone {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
; Maximum of two uniform doubles via the sse2.max.sd intrinsic.
; NOTE(review): sse_binary_scalar is an m4 macro defined outside this chunk; it
; is expected to bind %ret to the scalar result of applying the named
; <2 x double> intrinsic to %0 and %1 -- confirm against the macro definition.
; NOTE(review): unlike the float/int32/uint32 min/max helpers in this file,
; this definition has no alwaysinline attribute -- confirm that is intended.
define double @__max_uniform_double(double, double) nounwind readnone {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int32 min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
; Minimum of two uniform i32 values via the sse41.pminsd intrinsic.
; NOTE(review): sse_binary_scalar is an m4 macro defined outside this chunk; it
; is expected to bind %ret to the scalar result of applying the named
; <4 x i32> intrinsic to %0 and %1 -- confirm against the macro definition.
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Maximum of two uniform i32 values via the sse41.pmaxsd intrinsic.
; NOTE(review): sse_binary_scalar is an m4 macro defined outside this chunk; it
; is expected to bind %ret to the scalar result of applying the named
; <4 x i32> intrinsic to %0 and %1 -- confirm against the macro definition.
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; unsigned int min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
; Minimum of two uniform i32 values via the sse41.pminud intrinsic (the
; section header above labels this group "unsigned int min/max").
; NOTE(review): sse_binary_scalar is an m4 macro defined outside this chunk; it
; is expected to bind %ret to the scalar result of applying the named
; <4 x i32> intrinsic to %0 and %1 -- confirm against the macro definition.
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
; Maximum of two uniform i32 values via the sse41.pmaxud intrinsic (the
; section header above labels this group "unsigned int min/max").
; NOTE(review): sse_binary_scalar is an m4 macro defined outside this chunk; it
; is expected to bind %ret to the scalar result of applying the named
; <4 x i32> intrinsic to %0 and %1 -- confirm against the macro definition.
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
; Population count of an i32: forwards the argument to the llvm.ctpop.i32
; intrinsic and returns its result unchanged.
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
; Population count of an i64: forwards the argument to the llvm.ctpop.i64
; intrinsic and returns its result unchanged.
define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
@@ -36,18 +36,31 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; standard 8-wide definitions from m4 macros
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
define(`WIDTH',`8')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
|
||||
include(`target-sse4-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
|
||||
@@ -60,27 +73,12 @@ define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly
|
||||
ret <8 x float> %iv_mul
|
||||
}
|
||||
|
||||
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float iv = extract(__rcp_u(v), 0);
|
||||
; return iv * (2. - v * iv);
|
||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
||||
%scall = extractelement <4 x float> %call, i32 0
|
||||
|
||||
; do one N-R iteration
|
||||
%v_iv = fmul float %0, %scall
|
||||
%two_minus = fsub float 2., %v_iv
|
||||
%iv_mul = fmul float %scall, %two_minus
|
||||
ret float %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
@@ -94,56 +92,16 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read
|
||||
ret <8 x float> %half_scale
|
||||
}
|
||||
|
||||
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||
%is = extractelement <4 x float> %vis, i32 0
|
||||
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul float %0, %is
|
||||
%v_is_is = fmul float %v_is, %is
|
||||
%three_sub = fsub float 3., %v_is_is
|
||||
%is_mul = fmul float %is, %three_sub
|
||||
%half_scale = fmul float 0.5, %is_mul
|
||||
ret float %half_scale
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fast math
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define internal void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
@@ -158,17 +116,17 @@ declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
|
||||
define internal <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
|
||||
define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_sinf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
|
||||
define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_cosf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal void @__svml_sincos(<8 x float>, <8 x float> *,
|
||||
define void @__svml_sincos(<8 x float>, <8 x float> *,
|
||||
<8 x float> *) nounwind readnone alwaysinline {
|
||||
; call svml_sincosf4 two times with the two 4-wide sub-vectors
|
||||
%a = shufflevector <8 x float> %0, <8 x float> undef,
|
||||
@@ -197,33 +155,33 @@ define internal void @__svml_sincos(<8 x float>, <8 x float> *,
|
||||
ret void
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
|
||||
define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_tanf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
|
||||
define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_atanf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_atan2(<8 x float>,
|
||||
define <8 x float> @__svml_atan2(<8 x float>,
|
||||
<8 x float>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
|
||||
define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_expf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
|
||||
define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_logf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_pow(<8 x float>,
|
||||
define <8 x float> @__svml_pow(<8 x float>,
|
||||
<8 x float>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, float, @__svml_powf4, %0, %1)
|
||||
ret <8 x float> %ret
|
||||
@@ -234,91 +192,52 @@ define internal <8 x float> @__svml_pow(<8 x float>,
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int32 min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; unsigned int min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
define <8 x i32> @__min_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
define <8 x i32> @__max_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
@@ -332,106 +251,107 @@ define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
ret i32 %v
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
|
||||
}
|
||||
|
||||
define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8by4(float, @llvm.x86.sse.max.ps, @__max_uniform_float)
|
||||
}
|
||||
|
||||
; helper function for reduce_add_int32
|
||||
define internal <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
|
||||
<4 x i32> %v1) nounwind readnone alwaysinline {
|
||||
define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
|
||||
<4 x i32> %v1) nounwind readnone alwaysinline {
|
||||
%v = add <4 x i32> %v0, %v1
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
; helper function for reduce_add_int32
|
||||
define internal i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
%v = add i32 %0, %1
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @__vec4_add_int32, @__add_int32)
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @llvm.x86.sse41.pminsd, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
define internal <4 x double> @__add_varying_double(<4 x double>,
|
||||
define <4 x double> @__add_varying_double(<4 x double>,
|
||||
<4 x double>) nounwind readnone alwaysinline {
|
||||
%r = fadd <4 x double> %0, %1
|
||||
ret <4 x double> %r
|
||||
}
|
||||
|
||||
define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
%r = fadd double %0, %1
|
||||
ret double %r
|
||||
}
|
||||
|
||||
define internal double @__reduce_add_double(<8 x double>) nounwind readnone {
|
||||
define double @__reduce_add_double(<8 x double>) nounwind readnone {
|
||||
reduce8by4(double, @__add_varying_double, @__add_uniform_double)
|
||||
}
|
||||
|
||||
define internal double @__reduce_min_double(<8 x double>) nounwind readnone {
|
||||
define double @__reduce_min_double(<8 x double>) nounwind readnone {
|
||||
reduce8(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
define internal double @__reduce_max_double(<8 x double>) nounwind readnone {
|
||||
define double @__reduce_max_double(<8 x double>) nounwind readnone {
|
||||
reduce8(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
define internal <4 x i64> @__add_varying_int64(<4 x i64>,
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
define <4 x i64> @__add_varying_int64(<4 x i64>,
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
%r = add <4 x i64> %0, %1
|
||||
ret <4 x i64> %r
|
||||
}
|
||||
|
||||
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
%r = add i64 %0, %1
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
|
||||
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
|
||||
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
|
||||
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
|
||||
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
|
||||
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
@@ -440,156 +360,76 @@ reduce_equal(8)
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(8, i8, 8)
|
||||
load_and_broadcast(8, i16, 16)
|
||||
load_and_broadcast(8, i32, 32)
|
||||
load_and_broadcast(8, i64, 64)
|
||||
|
||||
load_masked(8, i8, 8, 1)
|
||||
load_masked(8, i16, 16, 2)
|
||||
load_masked(8, i32, 32, 4)
|
||||
load_masked(8, i64, 64, 8)
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
gen_gather(i32)
|
||||
gen_gather(float)
|
||||
gen_gather(i64)
|
||||
gen_gather(double)
|
||||
|
||||
gen_scatter(8, i8)
|
||||
gen_scatter(8, i16)
|
||||
gen_scatter(8, i32)
|
||||
gen_scatter(8, i64)
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float rounding
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
round4to8(%0, 8)
|
||||
}
|
||||
|
||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
; the roundss intrinsic is a total mess--docs say:
|
||||
;
|
||||
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
||||
;
|
||||
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
||||
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
||||
; return value is described by the following equations:
|
||||
;
|
||||
; r0 = RND(b0)
|
||||
; r1 = a1
|
||||
; r2 = a2
|
||||
; r3 = a3
|
||||
;
|
||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||
; here. So we pass the same register for both.
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round4to8(%0, 9)
|
||||
}
|
||||
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round4to8(%0, 10)
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
round2to8double(%0, 8)
|
||||
}
|
||||
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round2to8double(%0, 9)
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round2to8double(%0, 10)
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%a = shufflevector <8 x float> %0, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%b = shufflevector <8 x float> %0, <8 x float> undef,
|
||||
@@ -606,18 +446,18 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(8, i8, 8)
|
||||
gen_masked_store(8, i16, 16)
|
||||
gen_masked_store(8, i32, 32)
|
||||
gen_masked_store(8, i64, 64)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
|
||||
<4 x float>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
; do two 4-wide blends with blendvps
|
||||
%mask_as_float = bitcast <8 x i32> %mask to <8 x float>
|
||||
%mask_a = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
|
||||
@@ -646,8 +486,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
; implement this as 4 blends of <4 x i32>s, which are actually bitcast
|
||||
; <2 x i64>s...
|
||||
|
||||
@@ -713,49 +553,30 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
ret void
|
||||
}
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
|
||||
; Uniform (scalar) double-precision square root.  The body is produced by an
; m4 helper defined outside this chunk; it is handed the 2-wide packed sqrt
; intrinsic and the scalar argument, and its result is returned from %ret.
; NOTE(review): presumably the helper widens the scalar to a 2-element vector
; and extracts lane 0 afterwards -- confirm against the helper definition.
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision float min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
; Uniform (scalar) double-precision minimum of %0 and %1.  The body is produced
; by an m4 helper defined outside this chunk; it is handed the 2-wide packed
; min intrinsic and both scalar arguments, and its result is returned from %ret.
; NOTE(review): presumably only lane 0 of the packed operation is used --
; confirm against the helper definition.
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
; Uniform (scalar) double-precision maximum of %0 and %1.  The body is produced
; by an m4 helper defined outside this chunk; it is handed the 2-wide packed
; max intrinsic and both scalar arguments, and its result is returned from %ret.
; NOTE(review): presumably only lane 0 of the packed operation is used --
; confirm against the helper definition.
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret double %ret
|
||||

||||
}
|
||||
489
builtins/target-sse4.ll
Normal file
489
builtins/target-sse4.ll
Normal file
@@ -0,0 +1,489 @@
|
||||
;; Copyright (c) 2010-2012, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; Define common 4-wide stuff
|
||||
define(`WIDTH',`4')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
|
||||
include(`target-sse4-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
|
||||
; Varying reciprocal: the rcp.ps estimate e of 1/v refined by one
; Newton-Raphson step,  result = e * (2 - v * e).
define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; initial estimate from the hardware instruction
  %est = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
  ; one N-R iteration to improve precision
  %v_est = fmul <4 x float> %0, %est
  %correction = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_est
  %refined = fmul <4 x float> %est, %correction
  ret <4 x float> %refined
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
; Varying reciprocal square root: the rsqrt.ps estimate e refined by one
; Newton-Raphson step,  result = 0.5 * e * (3 - (v * e) * e).
define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
  ; initial estimate from the hardware instruction
  %est = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
  ; (v * e) * e
  %v_est = fmul <4 x float> %v, %est
  %v_est_sq = fmul <4 x float> %v_est, %est
  ; e * (3 - v*e*e)
  %three_minus = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_est_sq
  %scaled = fmul <4 x float> %est, %three_minus
  ; final factor of one half
  %refined = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %scaled
  ret <4 x float> %refined
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
; Varying single-precision square root; forwards to the packed SSE sqrt intrinsic.
define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %root = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
  ret <4 x float> %root
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
|
||||
; Varying double-precision square root.  The m4 helper invoked in the body is
; defined outside this chunk; it is passed the 2-wide packed sqrt intrinsic and
; the 4-wide argument, and its result is returned from %ret.
; NOTE(review): presumably it applies the intrinsic to each 2-wide part of the
; input and reassembles the 4-wide result -- confirm in the helper definition.
define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
|
||||
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
|
||||
|
||||
; Varying float round-to-nearest via the SSE4.1 packed round intrinsic.
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps immediate: mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  %rounded = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
  ret <4 x float> %rounded
}
|
||||
|
||||
; Varying float floor via the SSE4.1 packed round intrinsic.
define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps immediate: round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %floored = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
  ret <4 x float> %floored
}
|
||||
|
||||
; Varying float ceil via the SSE4.1 packed round intrinsic.
define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps immediate: round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %ceiled = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
  ret <4 x float> %ceiled
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
|
||||
|
||||
; Varying double round-to-nearest.  The body is produced by an m4 helper defined
; outside this chunk, which is given the input and the rounding immediate.
; roundpd, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
round2to4double(%0, 8)
|
||||
}
|
||||
|
||||
; Varying double floor.  The body is produced by an m4 helper defined outside
; this chunk, which is given the input and the rounding immediate.
define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round2to4double(%0, 9)
|
||||
}
|
||||
|
||||
; Varying double ceil.  The body is produced by an m4 helper defined outside
; this chunk, which is given the input and the rounding immediate.
define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round2to4double(%0, 10)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
; Lane-wise float maximum of %0 and %1 via the packed SSE max intrinsic.
define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
  %larger = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %larger
}
|
||||
|
||||
; Lane-wise float minimum of %0 and %1 via the packed SSE min intrinsic.
define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
  %smaller = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %smaller
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int32 min/max
|
||||
|
||||
; Lane-wise signed 32-bit minimum of %0 and %1 via the SSE4.1 pminsd intrinsic.
define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %smaller = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %smaller
}
|
||||
|
||||
; Lane-wise signed 32-bit maximum of %0 and %1 via the SSE4.1 pmaxsd intrinsic.
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %larger = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %larger
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; unsigned int min/max
|
||||
|
||||
; Lane-wise unsigned 32-bit minimum of %0 and %1 via the SSE4.1 pminud intrinsic.
define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %smaller = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %smaller
}
|
||||
|
||||
; Lane-wise unsigned 32-bit maximum of %0 and %1 via the SSE4.1 pmaxud intrinsic.
define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %larger = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %larger
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
; Lane-wise double-precision minimum of %0 and %1.  The m4 helper invoked in the
; body is defined outside this chunk; it is passed the 2-wide packed min
; intrinsic and both 4-wide arguments, and its result is returned from %ret.
; NOTE(review): presumably the intrinsic is applied to each 2-wide part of the
; inputs and the 4-wide result reassembled -- confirm in the helper definition.
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
; Lane-wise double-precision maximum of %0 and %1.  The m4 helper invoked in the
; body is defined outside this chunk; it is passed the 2-wide packed max
; intrinsic and both 4-wide arguments, and its result is returned from %ret.
; NOTE(review): presumably the intrinsic is applied to each 2-wide part of the
; inputs and the 4-wide result reassembled -- confirm in the helper definition.
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
; External 4-wide SVML entry points.
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
; The sincos entry point takes an output pointer in addition to returning a
; value, so it must be allowed to write memory: no readnone/readonly here.
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone

; Thin wrappers forwarding to the 4-wide SVML entry points declared above.

; sin of each lane of %0
define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
  ret <4 x float> %ret
}

; cos of each lane of %0
define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
  ret <4 x float> %ret
}

; Produces both results for %0 with a single library call.  The value returned
; by the call is stored through %1; the second result is expected to be written
; by the callee through the pointer passed in %2.
; NOTE(review): presumably %1 receives the sine and %2 the cosine -- confirm
; against the SVML documentation.
; This function stores to memory, so it must not be marked readnone: with that
; attribute the optimizer is entitled to drop the store or the whole call.
define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind alwaysinline {
  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
  store <4 x float> %s, <4 x float> * %1
  ret void
}

; tan of each lane of %0
define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
  ret <4 x float> %ret
}

; atan of each lane of %0
define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
  ret <4 x float> %ret
}

; atan2 of corresponding lanes of %0 and %1, passed through in that order
define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %ret
}

; exp of each lane of %0
define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_expf4(<4 x float> %0)
  ret <4 x float> %ret
}

; log of each lane of %0
define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_logf4(<4 x float> %0)
  ret <4 x float> %ret
}

; pow of corresponding lanes of %0 and %1, passed through in that order
define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %ret
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
; Collapses the 4-lane i32 mask to an integer with the movmsk.ps intrinsic and
; returns it zero-extended to 64 bits.  (Per the x86 MOVMSKPS definition the
; intrinsic gathers the sign bit of each lane.)
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
  ; the intrinsic takes float lanes, so reinterpret the bits
  %mask_f = bitcast <4 x i32> %0 to <4 x float>
  %bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %mask_f) nounwind readnone
  %bits64 = zext i32 %bits to i64
  ret i64 %bits64
}
|
||||
|
||||
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
; Sum of the four float lanes: two rounds of the SSE3 horizontal-add intrinsic,
; then lane 0 is returned.
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
  ; first round: pairwise sums
  %pairs = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
  ; second round: sum of the pairwise sums
  %total_vec = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %pairs, <4 x float> %pairs)
  %total = extractelement <4 x float> %total_vec, i32 0
  ret float %total
}
|
||||
|
||||
; Horizontal minimum over the four float lanes (presumably -- the body comes
; from an m4 reduction helper defined outside this chunk).  The helper is given
; the element type plus the varying and uniform float min routines.
define float @__reduce_min_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
; Horizontal maximum over the four float lanes (presumably -- the body comes
; from an m4 reduction helper defined outside this chunk).  The helper is given
; the element type plus the varying and uniform float max routines.
define float @__reduce_max_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
; Sum of the four i32 lanes of %v: fold the upper pair onto the lower pair with
; one vector add, then add the two remaining lanes as scalars.
define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
  ; bring lanes 2 and 3 down to positions 0 and 1 (upper lanes are don't-care)
  %upper = shufflevector <4 x i32> %v, <4 x i32> undef,
                         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; lane 0 = v2 + v0, lane 1 = v3 + v1
  %folded = add <4 x i32> %upper, %v
  %lane0 = extractelement <4 x i32> %folded, i32 0
  %lane1 = extractelement <4 x i32> %folded, i32 1
  %total = add i32 %lane0, %lane1
  ret i32 %total
}
|
||||
|
||||
; Horizontal signed minimum over the four i32 lanes (presumably -- the body
; comes from an m4 reduction helper defined outside this chunk).  The helper is
; given the element type plus the varying and uniform int32 min routines.
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
; Horizontal signed maximum over the four i32 lanes (presumably -- the body
; comes from an m4 reduction helper defined outside this chunk).  The helper is
; given the element type plus the varying and uniform int32 max routines.
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
; Sum of the four unsigned 32-bit lanes of %v; simply forwards to the int32
; reduction.
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
  %total = call i32 @__reduce_add_int32(<4 x i32> %v)
  ret i32 %total
}
|
||||
|
||||
; Horizontal unsigned minimum over the four i32 lanes (presumably -- the body
; comes from an m4 reduction helper defined outside this chunk).  The helper is
; given the element type plus the varying and uniform uint32 min routines.
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
; Horizontal unsigned maximum over the four i32 lanes (presumably -- the body
; comes from an m4 reduction helper defined outside this chunk).  The helper is
; given the element type plus the varying and uniform uint32 max routines.
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
; Sum of the four double lanes: add the lower and upper 2-wide parts as
; vectors, then add the two remaining lanes as scalars.
define double @__reduce_add_double(<4 x double>) nounwind readnone {
  ; lanes 0,1
  %lower = shufflevector <4 x double> %0, <4 x double> undef,
                         <2 x i32> <i32 0, i32 1>
  ; lanes 2,3
  %upper = shufflevector <4 x double> %0, <4 x double> undef,
                         <2 x i32> <i32 2, i32 3>
  %pair_sum = fadd <2 x double> %lower, %upper
  %lane0 = extractelement <2 x double> %pair_sum, i32 0
  %lane1 = extractelement <2 x double> %pair_sum, i32 1
  %total = fadd double %lane0, %lane1
  ret double %total
}
|
||||
|
||||
; Horizontal minimum over the four double lanes (presumably -- the body comes
; from an m4 reduction helper defined outside this chunk).  The helper is given
; the element type plus the varying and uniform double min routines.
define double @__reduce_min_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
; Horizontal maximum over the four double lanes (presumably -- the body comes
; from an m4 reduction helper defined outside this chunk).  The helper is given
; the element type plus the varying and uniform double max routines.
define double @__reduce_max_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
; Sum of the four i64 lanes: add the lower and upper 2-wide parts as vectors,
; then add the two remaining lanes as scalars.
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
  ; lanes 0,1
  %lower = shufflevector <4 x i64> %0, <4 x i64> undef,
                         <2 x i32> <i32 0, i32 1>
  ; lanes 2,3
  %upper = shufflevector <4 x i64> %0, <4 x i64> undef,
                         <2 x i32> <i32 2, i32 3>
  %pair_sum = add <2 x i64> %lower, %upper
  %lane0 = extractelement <2 x i64> %pair_sum, i32 0
  %lane1 = extractelement <2 x i64> %pair_sum, i32 1
  %total = add i64 %lane0, %lane1
  ret i64 %total
}
|
||||
|
||||
; Horizontal signed minimum over the four i64 lanes (presumably -- the body
; comes from an m4 reduction helper defined outside this chunk).  The helper is
; given the element type plus the varying and uniform int64 min routines.
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
; Horizontal signed maximum over the four i64 lanes (presumably -- the body
; comes from an m4 reduction helper defined outside this chunk).  The helper is
; given the element type plus the varying and uniform int64 max routines.
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
; Horizontal unsigned minimum over the four i64 lanes (presumably -- the body
; comes from an m4 reduction helper defined outside this chunk).  The helper is
; given the element type plus the varying and uniform uint64 min routines.
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
; Horizontal unsigned maximum over the four i64 lanes (presumably -- the body
; comes from an m4 reduction helper defined outside this chunk).  The helper is
; given the element type plus the varying and uniform uint64 max routines.
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
reduce_equal(4)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
|
||||
<4 x float>) nounwind readnone
|
||||
|
||||
|
||||
; Masked store of four i32 values implemented as load / blend / store: the
; current contents at %0 are loaded, combined with the new values %1 by the
; SSE4.1 blendvps intrinsic under control of %mask, and written back.
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
                                      <4 x i32> %mask) nounwind alwaysinline {
  %current = load <4 x i32>* %0, align 4
  ; blendvps works on float lanes, so reinterpret all three operands
  %current_f = bitcast <4 x i32> %current to <4 x float>
  %incoming_f = bitcast <4 x i32> %1 to <4 x float>
  %select_f = bitcast <4 x i32> %mask to <4 x float>
  %merged_f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %current_f,
                                                        <4 x float> %incoming_f,
                                                        <4 x float> %select_f)
  %merged = bitcast <4 x float> %merged_f to <4 x i32>
  store <4 x i32> %merged, <4 x i32>* %0, align 4
  ret void
}
|
||||
|
||||
|
||||
; Masked store of four i64 values implemented as load / blend / store.  The
; 4 x 64-bit blend is done as two blendvps calls, each operating on a pair of
; i64 values reinterpreted as <4 x float>; every 32-bit mask lane is therefore
; duplicated so that it covers both 32-bit parts of its 64-bit element.
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                      <4 x i32> %i32mask) nounwind alwaysinline {
  %current = load <4 x i64>* %ptr, align 8
  %select = bitcast <4 x i32> %i32mask to <4 x float>

  ; ---- elements 0 and 1 ----
  %current_lo = shufflevector <4 x i64> %current, <4 x i64> undef,
                              <2 x i32> <i32 0, i32 1>
  %current_lo_f = bitcast <2 x i64> %current_lo to <4 x float>
  %incoming_lo = shufflevector <4 x i64> %new, <4 x i64> undef,
                               <2 x i32> <i32 0, i32 1>
  %incoming_lo_f = bitcast <2 x i64> %incoming_lo to <4 x float>
  ; mask lanes 0 and 1, each repeated twice
  %select_lo = shufflevector <4 x float> %select, <4 x float> undef,
                             <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  %merged_lo_f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %current_lo_f,
                                                           <4 x float> %incoming_lo_f,
                                                           <4 x float> %select_lo)
  %merged_lo = bitcast <4 x float> %merged_lo_f to <2 x i64>

  ; ---- elements 2 and 3 ----
  %current_hi = shufflevector <4 x i64> %current, <4 x i64> undef,
                              <2 x i32> <i32 2, i32 3>
  %current_hi_f = bitcast <2 x i64> %current_hi to <4 x float>
  %incoming_hi = shufflevector <4 x i64> %new, <4 x i64> undef,
                               <2 x i32> <i32 2, i32 3>
  %incoming_hi_f = bitcast <2 x i64> %incoming_hi to <4 x float>
  ; mask lanes 2 and 3, each repeated twice
  %select_hi = shufflevector <4 x float> %select, <4 x float> undef,
                             <4 x i32> <i32 2, i32 2, i32 3, i32 3>
  %merged_hi_f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %current_hi_f,
                                                           <4 x float> %incoming_hi_f,
                                                           <4 x float> %select_hi)
  %merged_hi = bitcast <4 x float> %merged_hi_f to <2 x i64>

  ; concatenate the two pairs back into one <4 x i64> and write it out
  %merged = shufflevector <2 x i64> %merged_lo, <2 x i64> %merged_hi,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %merged, <4 x i64> * %ptr, align 8
  ret void
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
masked_store_blend_8_16_by_4()
|
||||
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from util.m4
|
||||
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
gen_gather(i32)
|
||||
gen_gather(float)
|
||||
gen_gather(i64)
|
||||
gen_gather(double)
|
||||
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
3491
builtins/util.m4
Normal file
3491
builtins/util.m4
Normal file
File diff suppressed because it is too large
Load Diff
4519
cbackend.cpp
Normal file
4519
cbackend.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@@ -17,7 +17,7 @@ syn keyword ispcStatement cbreak ccontinue creturn launch print reference soa sy
|
||||
syn keyword ispcConditional cif
|
||||
syn keyword ispcRepeat cdo cfor cwhile
|
||||
syn keyword ispcBuiltin programCount programIndex
|
||||
syn keyword ispcType export int8 int16 int32 int64
|
||||
syn keyword ispcType export uniform varying int8 int16 int32 int64
|
||||
|
||||
" Default highlighting
|
||||
command -nargs=+ HiLink hi def link <args>
|
||||
|
||||
8
contrib/ispc.vim.README
Normal file
8
contrib/ispc.vim.README
Normal file
@@ -0,0 +1,8 @@
|
||||
To install vim syntax highlighting for ispc files:
|
||||
|
||||
1) Copy ispc.vim into ~/.vim/syntax/ispc.vim (create if necessary)
|
||||
2) Create a filetype for ispc files to correspond to that syntax file
|
||||
To do this, append the following line to ~/.vim/ftdetect/ispc.vim (creating the file if it does not exist):
|
||||
|
||||
au BufRead,BufNewFile *.ispc set filetype=ispc
|
||||
|
||||
401
ctx.h
401
ctx.h
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -39,12 +39,15 @@
|
||||
#define ISPC_CTX_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include <map>
|
||||
#include <llvm/InstrTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#ifndef LLVM_2_8
|
||||
#include <llvm/Analysis/DIBuilder.h>
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_1)
|
||||
#include <llvm/Analysis/DebugInfo.h>
|
||||
#else
|
||||
#include <llvm/DebugInfo.h>
|
||||
#endif
|
||||
#include <llvm/Analysis/DebugInfo.h>
|
||||
|
||||
struct CFInfo;
|
||||
|
||||
@@ -59,17 +62,22 @@ struct CFInfo;
|
||||
class FunctionEmitContext {
|
||||
public:
|
||||
/** Create a new FunctionEmitContext.
|
||||
@param returnType The return type of the function
|
||||
@param function LLVM function in the current module that corresponds
|
||||
to the function
|
||||
@param function The Function object representing the function
|
||||
@param funSym Symbol that corresponds to the function
|
||||
@param llvmFunction LLVM function in the current module that corresponds
|
||||
to the function
|
||||
@param firstStmtPos Source file position of the first statement in the
|
||||
function
|
||||
*/
|
||||
FunctionEmitContext(const Type *returnType, llvm::Function *function, Symbol *funSym,
|
||||
FunctionEmitContext(Function *function, Symbol *funSym,
|
||||
llvm::Function *llvmFunction,
|
||||
SourcePos firstStmtPos);
|
||||
~FunctionEmitContext();
|
||||
|
||||
/** Returns the Function * corresponding to the function that we're
|
||||
currently generating code for. */
|
||||
const Function *GetFunction() const;
|
||||
|
||||
/** @name Current basic block management
|
||||
@{
|
||||
*/
|
||||
@@ -83,20 +91,33 @@ public:
|
||||
/** @name Mask management
|
||||
@{
|
||||
*/
|
||||
/** Returns the current mask value */
|
||||
llvm::Value *GetMask();
|
||||
/** Returns the mask value at entry to the current function. */
|
||||
llvm::Value *GetFunctionMask();
|
||||
|
||||
/** Returns the mask value corresponding to "varying" control flow
|
||||
within the current function. (i.e. this doesn't include the effect
|
||||
of the mask at function entry.) */
|
||||
llvm::Value *GetInternalMask();
|
||||
|
||||
/** Returns the complete current mask value--i.e. the logical AND of
|
||||
the function entry mask and the internal mask. */
|
||||
llvm::Value *GetFullMask();
|
||||
|
||||
/** Returns a pointer to storage in memory that stores the current full
|
||||
mask. */
|
||||
llvm::Value *GetFullMaskPointer();
|
||||
|
||||
/** Provides the value of the mask at function entry */
|
||||
void SetEntryMask(llvm::Value *val);
|
||||
void SetFunctionMask(llvm::Value *val);
|
||||
|
||||
/** Sets the mask to a new value */
|
||||
void SetMask(llvm::Value *val);
|
||||
/** Sets the internal mask to a new value */
|
||||
void SetInternalMask(llvm::Value *val);
|
||||
|
||||
/** Sets the mask to (oldMask & val) */
|
||||
void MaskAnd(llvm::Value *oldMask, llvm::Value *val);
|
||||
/** Sets the internal mask to (oldMask & val) */
|
||||
void SetInternalMaskAnd(llvm::Value *oldMask, llvm::Value *val);
|
||||
|
||||
/** Sets the mask to (oldMask & ~val) */
|
||||
void MaskAndNot(llvm::Value *oldMask, llvm::Value *test);
|
||||
/** Sets the internal mask to (oldMask & ~val) */
|
||||
void SetInternalMaskAndNot(llvm::Value *oldMask, llvm::Value *test);
|
||||
|
||||
/** Emits a branch instruction to the basic block btrue if any of the
|
||||
lanes of current mask are on and bfalse if none are on. */
|
||||
@@ -115,9 +136,8 @@ public:
|
||||
@{
|
||||
*/
|
||||
/** Notifies the FunctionEmitContext that we're starting emission of an
|
||||
'if' statement with a uniform test. The value of the mask going
|
||||
into the 'if' statement is provided in the oldMask parameter. */
|
||||
void StartUniformIf(llvm::Value *oldMask);
|
||||
'if' statement with a uniform test. */
|
||||
void StartUniformIf();
|
||||
|
||||
/** Notifies the FunctionEmitContext that we're starting emission of an
|
||||
'if' statement with a varying test. The value of the mask going
|
||||
@@ -132,10 +152,9 @@ public:
|
||||
for a loop. Basic blocks are provided for where 'break' and
|
||||
'continue' statements should jump to (if all running lanes want to
|
||||
break or continue), uniformControlFlow indicates whether the loop
|
||||
condition is 'uniform', and oldMask provides the current mask going
|
||||
into the loop. */
|
||||
condition is 'uniform'. */
|
||||
void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget,
|
||||
bool uniformControlFlow, llvm::Value *oldMask);
|
||||
bool uniformControlFlow);
|
||||
|
||||
/** Informs FunctionEmitContext of the value of the mask at the start
|
||||
of a loop body. */
|
||||
@@ -145,6 +164,12 @@ public:
|
||||
finished. */
|
||||
void EndLoop();
|
||||
|
||||
/** Indicates that code generation for a 'foreach', 'foreach_tiled',
|
||||
'foreach_active', or 'foreach_unique' loop is about to start. */
|
||||
enum ForeachType { FOREACH_REGULAR, FOREACH_ACTIVE, FOREACH_UNIQUE };
|
||||
void StartForeach(ForeachType ft);
|
||||
void EndForeach();
|
||||
|
||||
/** Emit code for a 'break' statement in a loop. If doCoherenceCheck
|
||||
is true, then if we're in a 'varying' loop, code will be emitted to
|
||||
see if all of the lanes want to break, in which case a jump to the
|
||||
@@ -165,10 +190,73 @@ public:
|
||||
previous iteration. */
|
||||
void RestoreContinuedLanes();
|
||||
|
||||
/** Indicates that code generation for a "switch" statement is about to
|
||||
start. isUniform indicates whether the "switch" value is uniform,
|
||||
and bbAfterSwitch gives the basic block immediately following the
|
||||
"switch" statement. (For example, if the switch condition is
|
||||
uniform, we jump here upon executing a "break" statement.) */
|
||||
void StartSwitch(bool isUniform, llvm::BasicBlock *bbAfterSwitch);
|
||||
/** Indicates the end of code generation for a "switch" statement. */
|
||||
void EndSwitch();
|
||||
|
||||
/** Emits code for a "switch" statement in the program.
|
||||
@param expr Gives the value of the expression after the "switch"
|
||||
@param defaultBlock Basic block to execute for the "default" case. This
|
||||
should be NULL if there is no "default" label inside
|
||||
the switch.
|
||||
@param caseBlocks vector that stores the mapping from label values
|
||||
after "case" statements to basic blocks corresponding
|
||||
to the "case" labels.
|
||||
@param nextBlocks For each basic block for a "case" or "default"
|
||||
label, this gives the basic block for the
|
||||
immediately-following "case" or "default" label (or
|
||||
the basic block after the "switch" statement for the
|
||||
last label.)
|
||||
*/
|
||||
void SwitchInst(llvm::Value *expr, llvm::BasicBlock *defaultBlock,
|
||||
const std::vector<std::pair<int, llvm::BasicBlock *> > &caseBlocks,
|
||||
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &nextBlocks);
|
||||
|
||||
/** Generates code for a "default" label after a "switch" statement.
|
||||
The checkMask parameter indicates whether additional code should be
|
||||
generated to check to see if the execution mask is all off after
|
||||
the default label (in which case a jump to the following label will
|
||||
be issued. */
|
||||
void EmitDefaultLabel(bool checkMask, SourcePos pos);
|
||||
|
||||
/** Generates code for a "case" label after a "switch" statement. See
|
||||
the documentation for EmitDefaultLabel() for discussion of the
|
||||
checkMask parameter. */
|
||||
void EmitCaseLabel(int value, bool checkMask, SourcePos pos);
|
||||
|
||||
/** Returns the current number of nested levels of 'varying' control
|
||||
flow */
|
||||
int VaryingCFDepth() const;
|
||||
|
||||
bool InForeachLoop() const;
|
||||
|
||||
/** Temporarily disables emission of performance warnings from gathers
|
||||
and scatters from subsequent code. */
|
||||
void DisableGatherScatterWarnings();
|
||||
|
||||
/** Reenables emission of gather/scatter performance warnings. */
|
||||
void EnableGatherScatterWarnings();
|
||||
|
||||
void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }
|
||||
|
||||
/** Step through the code and find label statements; create a basic
|
||||
block for each one, so that subsequent calls to
|
||||
GetLabeledBasicBlock() return the corresponding basic block. */
|
||||
void InitializeLabelMap(Stmt *code);
|
||||
|
||||
/** If there is a label in the function with the given name, return the
|
||||
new basic block that it starts. */
|
||||
llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
|
||||
|
||||
/** Returns a vector of all labels in the context. This is
|
||||
simply the key set of the labelMap */
|
||||
std::vector<std::string> GetLabels();
|
||||
|
||||
/** Called to generate code for 'return' statement; value is the
|
||||
expression in the return statement (if non-NULL), and
|
||||
doCoherenceCheck indicates whether instructions should be generated
|
||||
@@ -189,7 +277,11 @@ public:
|
||||
llvm::Value *All(llvm::Value *mask);
|
||||
|
||||
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
|
||||
i32 value wherein the i'th bit is on if and only if the i'th lane
|
||||
i1 value that indicates if all of the mask lanes are off. */
|
||||
llvm::Value *None(llvm::Value *mask);
|
||||
|
||||
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
|
||||
i64 value wherein the i'th bit is on if and only if the i'th lane
|
||||
of the mask is on. */
|
||||
llvm::Value *LaneMask(llvm::Value *mask);
|
||||
|
||||
@@ -210,16 +302,6 @@ public:
|
||||
i32. */
|
||||
llvm::Value *I1VecToBoolVec(llvm::Value *b);
|
||||
|
||||
/** Emit code to call the user-supplied ISPCMalloc function to
|
||||
allocate space for an object of thee given type. Returns the
|
||||
pointer value returned by the ISPCMalloc call. */
|
||||
llvm::Value *EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align = 0);
|
||||
|
||||
/** Emit code to call the user-supplied ISPCFree function, passing it
|
||||
the given pointer to storage previously allocated by an
|
||||
EmitMalloc() call. */
|
||||
void EmitFree(llvm::Value *ptr);
|
||||
|
||||
/** If the user has asked to compile the program with instrumentation,
|
||||
this inserts a callback to the user-supplied instrumentation
|
||||
function at the current point in the code. */
|
||||
@@ -265,7 +347,7 @@ public:
|
||||
|
||||
/** Emits debugging information for the function parameter represented
|
||||
by sym. */
|
||||
void EmitFunctionParameterDebugInfo(Symbol *sym);
|
||||
void EmitFunctionParameterDebugInfo(Symbol *sym, int parameterNum);
|
||||
/** @} */
|
||||
|
||||
/** @name IR instruction emission
|
||||
@@ -303,43 +385,71 @@ public:
|
||||
llvm::CmpInst::Predicate pred,
|
||||
llvm::Value *v0, llvm::Value *v1, const char *name = NULL);
|
||||
|
||||
llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
/** Given a scalar value, return a vector of the same type (or an
|
||||
array, for pointer types). */
|
||||
llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
|
||||
|
||||
llvm::Value *BitCastInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Value *IntToPtrInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
|
||||
llvm::Instruction *TruncInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
|
||||
LLVM_TYPE_CONST llvm::Type *type, const char *name = NULL);
|
||||
llvm::Instruction *FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Type *type, const char *name = NULL);
|
||||
llvm::Instruction *FPCastInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Instruction *SExtInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Instruction *ZExtInst(llvm::Value *value, llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
|
||||
/** This GEP method is a generalization of the standard one in LLVM; it
|
||||
supports both uniform and varying basePtr values (an array of
|
||||
pointers) as well as uniform and varying index values (arrays of
|
||||
indices). */
|
||||
/** Given two integer-typed values (but possibly one vector and the
|
||||
other not, and/or of possibly-different bit-widths), update their
|
||||
values as needed so that the two have the same (more general)
|
||||
type. */
|
||||
void MatchIntegerTypes(llvm::Value **v0, llvm::Value **v1);
|
||||
|
||||
/** Create a new slice pointer out of the given pointer to an soa type
|
||||
and an integer offset to a slice within that type. */
|
||||
llvm::Value *MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset);
|
||||
|
||||
/** These GEP methods are generalizations of the standard ones in LLVM;
|
||||
they support both uniform and varying basePtr values as well as
|
||||
uniform and varying index values (arrays of indices). Varying base
|
||||
pointers are expected to come in as vectors of i32/i64 (depending
|
||||
on the target), since LLVM doesn't currently support vectors of
|
||||
pointers. The underlying type of the base pointer must be provided
|
||||
via the ptrType parameter */
|
||||
llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
|
||||
const Type *ptrType, const char *name = NULL);
|
||||
llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
|
||||
llvm::Value *index1, const char *name = NULL);
|
||||
|
||||
/** This is a convenience method to generate a GEP instruction with
|
||||
indices with values with known constant values as the ispc program
|
||||
is being compiled. */
|
||||
llvm::Value *GetElementPtrInst(llvm::Value *basePtr, int v0, int v1,
|
||||
llvm::Value *index1, const Type *ptrType,
|
||||
const char *name = NULL);
|
||||
|
||||
/** Load from the memory location(s) given by lvalue. The lvalue may
|
||||
be varying, in which case this corresponds to a gather from the
|
||||
multiple memory locations given by the array of pointer values
|
||||
given by the lvalue. If the lvalue is not varying, then the type
|
||||
parameter may be NULL. */
|
||||
llvm::Value *LoadInst(llvm::Value *lvalue, const Type *type,
|
||||
const char *name = NULL);
|
||||
/** This method returns a new pointer that represents offsetting the
|
||||
given base pointer to point at the given element number of the
|
||||
structure type that the base pointer points to. (The provided
|
||||
pointer must be a pointer to a structure type. The ptrType gives
|
||||
the type of the pointer, though it may be NULL if the base pointer
|
||||
is uniform.) */
|
||||
llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
|
||||
const Type *ptrType, const char *name = NULL,
|
||||
const PointerType **resultPtrType = NULL);
|
||||
|
||||
/** Load from the memory location(s) given by lvalue, using the given
|
||||
mask. The lvalue may be varying, in which case this corresponds to
|
||||
a gather from the multiple memory locations given by the array of
|
||||
pointer values given by the lvalue. If the lvalue is not varying,
|
||||
then both the mask pointer and the type pointer may be NULL. */
|
||||
llvm::Value *LoadInst(llvm::Value *ptr, llvm::Value *mask,
|
||||
const Type *ptrType, const char *name = NULL);
|
||||
|
||||
llvm::Value *LoadInst(llvm::Value *ptr, const char *name = NULL);
|
||||
|
||||
/** Emits an alloca instruction to allocate stack storage for the given
|
||||
type. If a non-zero alignment is specified, the object is also
|
||||
@@ -347,21 +457,27 @@ public:
|
||||
instruction is added at the start of the function in the entry
|
||||
basic block; if it should be added to the current basic block, then
|
||||
the atEntryBlock parameter should be false. */
|
||||
llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name = NULL,
|
||||
int align = 0, bool atEntryBlock = true);
|
||||
llvm::Value *AllocaInst(llvm::Type *llvmType,
|
||||
const char *name = NULL, int align = 0,
|
||||
bool atEntryBlock = true);
|
||||
|
||||
/** Standard store instruction; for this variant, the lvalue must be a
|
||||
single pointer, not a varying lvalue. */
|
||||
void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
const char *name = NULL);
|
||||
void StoreInst(llvm::Value *value, llvm::Value *ptr);
|
||||
|
||||
/** In this variant of StoreInst(), the lvalue may be varying. If so,
|
||||
this corresponds to a scatter. Whether the lvalue is uniform or
|
||||
varying, the given storeMask is used to mask the stores so that
|
||||
they only execute for the active program instances. */
|
||||
void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
llvm::Value *storeMask, const Type *rvalueType,
|
||||
const char *name = NULL);
|
||||
void StoreInst(llvm::Value *value, llvm::Value *ptr,
|
||||
llvm::Value *storeMask, const Type *valueType,
|
||||
const Type *ptrType);
|
||||
|
||||
/** Copy count bytes of memory from the location pointed to by src to
|
||||
the location pointed to by dest. (src and dest must not be
|
||||
overlapping.) */
|
||||
void MemcpyInst(llvm::Value *dest, llvm::Value *src, llvm::Value *count,
|
||||
llvm::Value *align = NULL);
|
||||
|
||||
void BranchInst(llvm::BasicBlock *block);
|
||||
void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
|
||||
@@ -378,33 +494,48 @@ public:
|
||||
llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
|
||||
const char *name = NULL);
|
||||
|
||||
llvm::PHINode *PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
|
||||
llvm::PHINode *PhiNode(llvm::Type *type, int count,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
|
||||
llvm::Value *val1, const char *name = NULL);
|
||||
|
||||
llvm::Instruction *CallInst(llvm::Function *func,
|
||||
const std::vector<llvm::Value *> &args,
|
||||
const char *name = NULL);
|
||||
/** Emits IR to do a function call with the given arguments. If the
|
||||
function type is a varying function pointer type, its full type
|
||||
must be provided in funcType. funcType can be NULL if func is a
|
||||
uniform function pointer. */
|
||||
llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
|
||||
const std::vector<llvm::Value *> &args,
|
||||
const char *name = NULL);
|
||||
|
||||
/** This is a convenience method that issues a call instruction to a
|
||||
function that takes just a single argument. */
|
||||
llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg,
|
||||
const char *name = NULL);
|
||||
llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
|
||||
llvm::Value *arg, const char *name = NULL);
|
||||
|
||||
/** This is a convenience method that issues a call instruction to a
|
||||
function that takes two arguments. */
|
||||
llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg0,
|
||||
llvm::Value *arg1, const char *name = NULL);
|
||||
llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
|
||||
llvm::Value *arg0, llvm::Value *arg1,
|
||||
const char *name = NULL);
|
||||
|
||||
/** Launch an asynchronous task to run the given function, passing it
|
||||
the given argument values. */
|
||||
llvm::Instruction *LaunchInst(llvm::Function *callee,
|
||||
std::vector<llvm::Value *> &argVals);
|
||||
llvm::Value *LaunchInst(llvm::Value *callee,
|
||||
std::vector<llvm::Value *> &argVals,
|
||||
llvm::Value *launchCount);
|
||||
|
||||
void SyncInst();
|
||||
|
||||
llvm::Instruction *ReturnInst();
|
||||
/** @} */
|
||||
|
||||
private:
|
||||
/** Pointer to the Function for which we're currently generating code. */
|
||||
Function *function;
|
||||
|
||||
/** LLVM function representation for the current function. */
|
||||
llvm::Function *llvmFunction;
|
||||
|
||||
/** The basic block into which we add any alloca instructions that need
|
||||
to go at the very start of the function. */
|
||||
llvm::BasicBlock *allocaBlock;
|
||||
@@ -414,8 +545,16 @@ private:
|
||||
llvm::BasicBlock *bblock;
|
||||
|
||||
/** Pointer to stack-allocated memory that stores the current value of
|
||||
the program mask. */
|
||||
llvm::Value *maskPtr;
|
||||
the full program mask. */
|
||||
llvm::Value *fullMaskPointer;
|
||||
|
||||
/** Pointer to stack-allocated memory that stores the current value of
|
||||
the program mask representing varying control flow within the
|
||||
function. */
|
||||
llvm::Value *internalMaskPointer;
|
||||
|
||||
/** Value of the program mask when the function starts execution. */
|
||||
llvm::Value *functionMaskValue;
|
||||
|
||||
/** Current source file position; if debugging information is being
|
||||
generated, this position is used to set file/line information for
|
||||
@@ -426,20 +565,14 @@ private:
|
||||
for error messages and debugging symbols. */
|
||||
SourcePos funcStartPos;
|
||||
|
||||
/** Type of result that the current function returns. */
|
||||
const Type *returnType;
|
||||
|
||||
/** Value of the program mask when the function starts execution. */
|
||||
llvm::Value *entryMask;
|
||||
|
||||
/** If currently in a loop body, the value of the mask at the start of
|
||||
the loop. */
|
||||
llvm::Value *loopMask;
|
||||
|
||||
/** If currently in a loop body, this is a pointer to memory to store a
|
||||
mask value that represents which of the lanes have executed a
|
||||
'break' statement. If we're not in a loop body, this should be
|
||||
NULL. */
|
||||
/** If currently in a loop body or switch statement, this is a pointer
|
||||
to memory to store a mask value that represents which of the lanes
|
||||
have executed a 'break' statement. If we're not in a loop body or
|
||||
switch, this should be NULL. */
|
||||
llvm::Value *breakLanesPtr;
|
||||
|
||||
/** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
|
||||
@@ -447,16 +580,49 @@ private:
|
||||
'continue' statement. */
|
||||
llvm::Value *continueLanesPtr;
|
||||
|
||||
/** If we're inside a loop, this gives the basic block immediately
|
||||
after the current loop, which we will jump to if all of the lanes
|
||||
have executed a break statement or are otherwise done with the
|
||||
loop. */
|
||||
/** If we're inside a loop or switch statement, this gives the basic
|
||||
block immediately after the current loop or switch, which we will
|
||||
jump to if all of the lanes have executed a break statement or are
|
||||
otherwise done with it. */
|
||||
llvm::BasicBlock *breakTarget;
|
||||
|
||||
/** If we're inside a loop, this gives the block to jump to if all of
|
||||
the running lanes have executed a 'continue' statement. */
|
||||
llvm::BasicBlock *continueTarget;
|
||||
|
||||
/** @name Switch statement state
|
||||
|
||||
These variables store various state that's active when we're
|
||||
generating code for a switch statement. They should all be NULL
|
||||
outside of a switch.
|
||||
@{
|
||||
*/
|
||||
|
||||
/** The value of the expression used to determine which case in the
|
||||
statements after the switch to execute. */
|
||||
llvm::Value *switchExpr;
|
||||
|
||||
/** Map from case label numbers to the basic block that will hold code
|
||||
for that case. */
|
||||
const std::vector<std::pair<int, llvm::BasicBlock *> > *caseBlocks;
|
||||
|
||||
/** The basic block of code to run for the "default" label in the
|
||||
switch statement. */
|
||||
llvm::BasicBlock *defaultBlock;
|
||||
|
||||
/** For each basic block for the code for cases (and the default label,
|
||||
if present), this map gives the basic block for the immediately
|
||||
following case/default label. */
|
||||
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *nextBlocks;
|
||||
|
||||
/** Records whether the switch condition was uniform; this is a
|
||||
distinct notion from whether the switch represents uniform or
|
||||
varying control flow; we may have varying control flow from a
|
||||
uniform switch condition if there is a 'break' inside the switch
|
||||
that's under varying control flow. */
|
||||
bool switchConditionWasUniform;
|
||||
/** @} */
|
||||
|
||||
/** A pointer to memory that records which of the program instances
|
||||
have executed a 'return' statement (and are thus really truly done
|
||||
running any more instructions in this function). */
|
||||
@@ -475,12 +641,12 @@ private:
|
||||
std::vector<CFInfo *> controlFlowInfo;
|
||||
|
||||
/** DIFile object corresponding to the source file where the current
|
||||
function was defined (used for debugging info0. */
|
||||
function was defined (used for debugging info). */
|
||||
llvm::DIFile diFile;
|
||||
|
||||
/** DISubprogram corresponding to this function (used for debugging
|
||||
info). */
|
||||
llvm::DISubprogram diFunction;
|
||||
llvm::DISubprogram diSubprogram;
|
||||
|
||||
/** These correspond to the current set of nested scopes in the
|
||||
function. */
|
||||
@@ -489,20 +655,49 @@ private:
|
||||
/** True if a 'launch' statement has been encountered in the function. */
|
||||
bool launchedTasks;
|
||||
|
||||
/** This is a pointer to a void * that is passed to the ISPCLaunch(),
|
||||
ISPCAlloc(), and ISPCSync() routines as a handle to the group of
|
||||
tasks launched from the current function. */
|
||||
llvm::Value *launchGroupHandlePtr;
|
||||
|
||||
/** Nesting count of the number of times calling code has disabled (and
|
||||
not yet reenabled) gather/scatter performance warnings. */
|
||||
int disableGSWarningCount;
|
||||
|
||||
std::map<std::string, llvm::BasicBlock *> labelMap;
|
||||
|
||||
static bool initLabelBBlocks(ASTNode *node, void *data);
|
||||
|
||||
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
|
||||
static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
|
||||
bool ifsInLoopAllUniform() const;
|
||||
static void addGSMetadata(llvm::Value *inst, SourcePos pos);
|
||||
bool ifsInCFAllUniform(int cfType) const;
|
||||
void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
|
||||
llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
|
||||
|
||||
void restoreMaskGivenReturns(llvm::Value *oldMask);
|
||||
llvm::Value *applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
|
||||
const Type *ptrType);
|
||||
|
||||
void scatter(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
llvm::Value *maskPtr, const Type *rvalueType);
|
||||
llvm::Value *gather(llvm::Value *lvalue, const Type *type,
|
||||
const char *name);
|
||||
void maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
const Type *rvalueType, llvm::Value *maskPtr);
|
||||
void restoreMaskGivenReturns(llvm::Value *oldMask);
|
||||
void addSwitchMaskCheck(llvm::Value *mask);
|
||||
bool inSwitchStatement() const;
|
||||
llvm::Value *getMaskAtSwitchEntry();
|
||||
|
||||
CFInfo *popCFState();
|
||||
|
||||
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *valueType,
|
||||
const Type *ptrType, llvm::Value *mask);
|
||||
void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
|
||||
llvm::Value *mask);
|
||||
void storeUniformToSOA(llvm::Value *value, llvm::Value *ptr,
|
||||
llvm::Value *mask, const Type *valueType,
|
||||
const PointerType *ptrType);
|
||||
llvm::Value *loadUniformFromSOA(llvm::Value *ptr, llvm::Value *mask,
|
||||
const PointerType *ptrType, const char *name);
|
||||
|
||||
llvm::Value *gather(llvm::Value *ptr, const PointerType *ptrType,
|
||||
llvm::Value *mask, const char *name);
|
||||
|
||||
llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
|
||||
};
|
||||
|
||||
#endif // ISPC_CTX_H
|
||||
|
||||
804
decl.cpp
804
decl.cpp
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -33,15 +33,87 @@
|
||||
|
||||
/** @file decl.cpp
|
||||
@brief Implementations of classes related to turning declarations into
|
||||
symbols and types.
|
||||
symbol names and types.
|
||||
*/
|
||||
|
||||
#include "decl.h"
|
||||
#include "util.h"
|
||||
#include "module.h"
|
||||
#include "sym.h"
|
||||
#include "type.h"
|
||||
#include "stmt.h"
|
||||
#include "expr.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <set>
|
||||
|
||||
static void
|
||||
lPrintTypeQualifiers(int typeQualifiers) {
|
||||
if (typeQualifiers & TYPEQUAL_INLINE) printf("inline ");
|
||||
if (typeQualifiers & TYPEQUAL_CONST) printf("const ");
|
||||
if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
|
||||
if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
|
||||
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
|
||||
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
|
||||
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
|
||||
if (typeQualifiers & TYPEQUAL_EXPORT) printf("export ");
|
||||
if (typeQualifiers & TYPEQUAL_UNMASKED) printf("unmasked ");
|
||||
}
|
||||
|
||||
|
||||
/** Given a Type and a set of type qualifiers, apply the type qualifiers to
|
||||
the type, returning the type that is the result.
|
||||
*/
|
||||
static const Type *
|
||||
lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
|
||||
if (type == NULL)
|
||||
return NULL;
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
|
||||
type = type->GetAsConstType();
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) {
|
||||
if (Type::Equal(type, AtomicType::Void))
|
||||
Error(pos, "\"uniform\" qualifier is illegal with \"void\" type.");
|
||||
else
|
||||
type = type->GetAsUniformType();
|
||||
}
|
||||
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0) {
|
||||
if (Type::Equal(type, AtomicType::Void))
|
||||
Error(pos, "\"varying\" qualifier is illegal with \"void\" type.");
|
||||
else
|
||||
type = type->GetAsVaryingType();
|
||||
}
|
||||
else
|
||||
if (Type::Equal(type, AtomicType::Void) == false)
|
||||
type = type->GetAsUnboundVariabilityType();
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
|
||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
|
||||
Error(pos, "Illegal to apply both \"signed\" and \"unsigned\" "
|
||||
"qualifiers.");
|
||||
|
||||
const Type *unsignedType = type->GetAsUnsignedType();
|
||||
if (unsignedType != NULL)
|
||||
type = unsignedType;
|
||||
else {
|
||||
const Type *resolvedType =
|
||||
type->ResolveUnboundVariability(Variability::Varying);
|
||||
Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
|
||||
resolvedType->GetString().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false) {
|
||||
const Type *resolvedType =
|
||||
type->ResolveUnboundVariability(Variability::Varying);
|
||||
Error(pos, "\"signed\" qualifier is illegal with non-integer type "
|
||||
"\"%s\".", resolvedType->GetString().c_str());
|
||||
}
|
||||
|
||||
return type;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// DeclSpecs
|
||||
@@ -49,302 +121,588 @@
|
||||
DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
|
||||
baseType = t;
|
||||
storageClass = sc;
|
||||
typeQualifier = tq;
|
||||
typeQualifiers = tq;
|
||||
soaWidth = 0;
|
||||
vectorSize = 0;
|
||||
}
|
||||
|
||||
|
||||
const Type *
|
||||
DeclSpecs::GetBaseType(SourcePos pos) const {
|
||||
const Type *retType = baseType;
|
||||
|
||||
if (retType == NULL) {
|
||||
Warning(pos, "No type specified in declaration. Assuming int32.");
|
||||
retType = AtomicType::UniformInt32->GetAsUnboundVariabilityType();
|
||||
}
|
||||
|
||||
if (vectorSize > 0) {
|
||||
const AtomicType *atomicType = CastType<AtomicType>(retType);
|
||||
if (atomicType == NULL) {
|
||||
Error(pos, "Only atomic types (int, float, ...) are legal for vector "
|
||||
"types.");
|
||||
return NULL;
|
||||
}
|
||||
retType = new VectorType(atomicType, vectorSize);
|
||||
}
|
||||
|
||||
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
|
||||
|
||||
if (soaWidth > 0) {
|
||||
const StructType *st = CastType<StructType>(retType);
|
||||
|
||||
if (st == NULL) {
|
||||
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
|
||||
"type \"%s\".", soaWidth, retType->GetString().c_str());
|
||||
return NULL;
|
||||
}
|
||||
else if (soaWidth <= 0 || (soaWidth & (soaWidth - 1)) != 0) {
|
||||
Error(pos, "soa<%d> width illegal. Value must be positive power "
|
||||
"of two.", soaWidth);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (st->IsUniformType()) {
|
||||
Error(pos, "\"uniform\" qualifier and \"soa<%d>\" qualifier can't "
|
||||
"both be used in a type declaration.", soaWidth);
|
||||
return NULL;
|
||||
}
|
||||
else if (st->IsVaryingType()) {
|
||||
Error(pos, "\"varying\" qualifier and \"soa<%d>\" qualifier can't "
|
||||
"both be used in a type declaration.", soaWidth);
|
||||
return NULL;
|
||||
}
|
||||
else
|
||||
retType = st->GetAsSOAType(soaWidth);
|
||||
|
||||
if (soaWidth < g->target.vectorWidth)
|
||||
PerformanceWarning(pos, "soa<%d> width smaller than gang size %d "
|
||||
"currently leads to inefficient code to access "
|
||||
"soa types.", soaWidth, g->target.vectorWidth);
|
||||
}
|
||||
|
||||
return retType;
|
||||
}
|
||||
|
||||
|
||||
static const char *
|
||||
lGetStorageClassName(StorageClass storageClass) {
|
||||
switch (storageClass) {
|
||||
case SC_NONE: return "";
|
||||
case SC_EXTERN: return "extern";
|
||||
case SC_EXTERN_C: return "extern \"C\"";
|
||||
case SC_STATIC: return "static";
|
||||
case SC_TYPEDEF: return "typedef";
|
||||
default: FATAL("Unhandled storage class in lGetStorageClassName");
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DeclSpecs::Print() const {
|
||||
if (storageClass == SC_EXTERN) printf("extern ");
|
||||
if (storageClass == SC_EXTERN_C) printf("extern \"C\" ");
|
||||
if (storageClass == SC_EXPORT) printf("export ");
|
||||
if (storageClass == SC_STATIC) printf("static ");
|
||||
if (storageClass == SC_TYPEDEF) printf("typedef ");
|
||||
printf("Declspecs: [%s ", lGetStorageClassName(storageClass));
|
||||
|
||||
if (soaWidth > 0) printf("soa<%d> ", soaWidth);
|
||||
|
||||
if (typeQualifier & TYPEQUAL_INLINE) printf("inline ");
|
||||
if (typeQualifier & TYPEQUAL_CONST) printf("const ");
|
||||
if (typeQualifier & TYPEQUAL_UNIFORM) printf("uniform ");
|
||||
if (typeQualifier & TYPEQUAL_VARYING) printf("varying ");
|
||||
if (typeQualifier & TYPEQUAL_TASK) printf("task ");
|
||||
if (typeQualifier & TYPEQUAL_REFERENCE) printf("reference ");
|
||||
if (typeQualifier & TYPEQUAL_UNSIGNED) printf("unsigned ");
|
||||
|
||||
printf("%s", baseType->GetString().c_str());
|
||||
lPrintTypeQualifiers(typeQualifiers);
|
||||
printf("base type: %s", baseType->GetString().c_str());
|
||||
|
||||
if (vectorSize > 0) printf("<%d>", vectorSize);
|
||||
printf("]");
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Declarator
|
||||
|
||||
Declarator::Declarator(Symbol *s, SourcePos p)
|
||||
: pos(p) {
|
||||
sym = s;
|
||||
functionArgs = NULL;
|
||||
isFunction = false;
|
||||
Declarator::Declarator(DeclaratorKind dk, SourcePos p)
|
||||
: pos(p), kind(dk) {
|
||||
child = NULL;
|
||||
typeQualifiers = 0;
|
||||
storageClass = SC_NONE;
|
||||
arraySize = -1;
|
||||
type = NULL;
|
||||
initExpr = NULL;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Declarator::AddArrayDimension(int size) {
|
||||
assert(size > 0 || size == -1); // -1 -> unsized
|
||||
arraySize.push_back(size);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
|
||||
sym->type = GetType(ds);
|
||||
const Type *baseType = ds->GetBaseType(pos);
|
||||
InitFromType(baseType, ds);
|
||||
|
||||
if (ds->storageClass == SC_STATIC)
|
||||
sym->isStatic = true;
|
||||
if (type == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
return;
|
||||
}
|
||||
|
||||
storageClass = ds->storageClass;
|
||||
|
||||
if (ds->declSpecList.size() > 0 &&
|
||||
CastType<FunctionType>(type) == NULL) {
|
||||
Error(pos, "__declspec specifiers for non-function type \"%s\" are "
|
||||
"not used.", type->GetString().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Declarator::Print() const {
|
||||
printf("%s", sym->name.c_str());
|
||||
Declarator::Print(int indent) const {
|
||||
printf("%*cdeclarator: [", indent, ' ');
|
||||
pos.Print();
|
||||
|
||||
lPrintTypeQualifiers(typeQualifiers);
|
||||
printf("%s ", lGetStorageClassName(storageClass));
|
||||
if (name.size() > 0)
|
||||
printf("%s", name.c_str());
|
||||
else
|
||||
printf("(unnamed)");
|
||||
|
||||
printf(", array size = %d", arraySize);
|
||||
|
||||
printf(", kind = ");
|
||||
switch (kind) {
|
||||
case DK_BASE: printf("base"); break;
|
||||
case DK_POINTER: printf("pointer"); break;
|
||||
case DK_REFERENCE: printf("reference"); break;
|
||||
case DK_ARRAY: printf("array"); break;
|
||||
case DK_FUNCTION: printf("function"); break;
|
||||
default: FATAL("Unhandled declarator kind");
|
||||
}
|
||||
|
||||
if (initExpr != NULL) {
|
||||
printf(" = (");
|
||||
initExpr->Print();
|
||||
printf(")");
|
||||
}
|
||||
pos.Print();
|
||||
|
||||
if (functionParams.size() > 0) {
|
||||
for (unsigned int i = 0; i < functionParams.size(); ++i) {
|
||||
printf("\n%*cfunc param %d:\n", indent, ' ', i);
|
||||
functionParams[i]->Print(indent+4);
|
||||
}
|
||||
}
|
||||
|
||||
if (child != NULL)
|
||||
child->Print(indent + 4);
|
||||
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
|
||||
static const Type *
|
||||
lGetType(const Declarator *decl, DeclSpecs *ds,
|
||||
std::vector<int>::const_iterator arrayIter) {
|
||||
if (arrayIter == decl->arraySize.end()) {
|
||||
// If we don't have an array (or have processed all of the array
|
||||
// dimensions in previous recursive calls), we can go ahead and
|
||||
// figure out the final non-array type we have here.
|
||||
const Type *type = ds->baseType;
|
||||
if (type == NULL) {
|
||||
Error(decl->pos, "Type not provided in variable declaration for variable \"%s\".",
|
||||
decl->sym->name.c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Account for 'unsigned' and 'const' qualifiers in the type
|
||||
if ((ds->typeQualifier & TYPEQUAL_UNSIGNED) != 0) {
|
||||
const Type *unsignedType = type->GetAsUnsignedType();
|
||||
if (unsignedType != NULL)
|
||||
type = unsignedType;
|
||||
else
|
||||
Error(decl->pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
|
||||
type->GetString().c_str());
|
||||
}
|
||||
if ((ds->typeQualifier & TYPEQUAL_CONST) != 0)
|
||||
type = type->GetAsConstType();
|
||||
|
||||
if (ds->vectorSize > 0) {
|
||||
const AtomicType *atomicType = dynamic_cast<const AtomicType *>(type);
|
||||
if (atomicType == NULL) {
|
||||
Error(decl->pos, "Only atomic types (int, float, ...) are legal for vector "
|
||||
"types.");
|
||||
return NULL;
|
||||
}
|
||||
type = new VectorType(atomicType, ds->vectorSize);
|
||||
}
|
||||
|
||||
// if uniform/varying is specified explicitly, then go with that
|
||||
if ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0)
|
||||
return type->GetAsUniformType();
|
||||
else if ((ds->typeQualifier & TYPEQUAL_VARYING) != 0)
|
||||
return type->GetAsVaryingType();
|
||||
else {
|
||||
// otherwise, structs are uniform by default and everything
|
||||
// else is varying by default
|
||||
if (dynamic_cast<const StructType *>(type) != NULL)
|
||||
return type->GetAsUniformType();
|
||||
else
|
||||
return type->GetAsVaryingType();
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Peel off one dimension of the array
|
||||
int arraySize = *arrayIter;
|
||||
++arrayIter;
|
||||
|
||||
// Get the type, not including the arraySize dimension peeled off
|
||||
// above.
|
||||
const Type *childType = lGetType(decl, ds, arrayIter);
|
||||
|
||||
int soaWidth = ds->soaWidth;
|
||||
if (soaWidth == 0)
|
||||
// If there's no "soa<n>" stuff going on, just return a regular
|
||||
// array with the appropriate size
|
||||
return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
|
||||
else {
|
||||
// Make sure we actually have an array of structs ..
|
||||
const StructType *childStructType =
|
||||
dynamic_cast<const StructType *>(childType);
|
||||
if (childStructType == NULL) {
|
||||
Error(decl->pos, "Illegal to provide soa<%d> qualifier with non-struct "
|
||||
"type \"%s\".", soaWidth, childType->GetString().c_str());
|
||||
return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
|
||||
}
|
||||
else if ((soaWidth & (soaWidth - 1)) != 0) {
|
||||
Error(decl->pos, "soa<%d> width illegal. Value must be power of two.",
|
||||
soaWidth);
|
||||
return NULL;
|
||||
}
|
||||
else if (arraySize != -1 && (arraySize % soaWidth) != 0) {
|
||||
Error(decl->pos, "soa<%d> width must evenly divide array size %d.",
|
||||
soaWidth, arraySize);
|
||||
return NULL;
|
||||
}
|
||||
return new SOAArrayType(childStructType, arraySize == -1 ? 0 : arraySize,
|
||||
soaWidth);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
const Type *
|
||||
Declarator::GetType(DeclSpecs *ds) const {
|
||||
bool hasUniformQual = ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0);
|
||||
bool hasVaryingQual = ((ds->typeQualifier & TYPEQUAL_VARYING) != 0);
|
||||
bool isTask = ((ds->typeQualifier & TYPEQUAL_TASK) != 0);
|
||||
bool isReference = ((ds->typeQualifier & TYPEQUAL_REFERENCE) != 0);
|
||||
void
|
||||
Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
|
||||
bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
|
||||
bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
|
||||
bool isTask = ((typeQualifiers & TYPEQUAL_TASK) != 0);
|
||||
bool isExported = ((typeQualifiers & TYPEQUAL_EXPORT) != 0);
|
||||
bool isConst = ((typeQualifiers & TYPEQUAL_CONST) != 0);
|
||||
bool isUnmasked = ((typeQualifiers & TYPEQUAL_UNMASKED) != 0);
|
||||
|
||||
if (hasUniformQual && hasVaryingQual) {
|
||||
Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
if (kind != DK_FUNCTION && isTask) {
|
||||
Error(pos, "\"task\" qualifier illegal in variable declaration.");
|
||||
return;
|
||||
}
|
||||
if (kind != DK_FUNCTION && isUnmasked) {
|
||||
Error(pos, "\"unmasked\" qualifier illegal in variable declaration.");
|
||||
return;
|
||||
}
|
||||
if (kind != DK_FUNCTION && isExported) {
|
||||
Error(pos, "\"export\" qualifier illegal in variable declaration.");
|
||||
return;
|
||||
}
|
||||
|
||||
Variability variability(Variability::Unbound);
|
||||
if (hasUniformQual)
|
||||
variability = Variability::Uniform;
|
||||
else if (hasVaryingQual)
|
||||
variability = Variability::Varying;
|
||||
|
||||
if (isFunction) {
|
||||
std::vector<const Type *> args;
|
||||
std::vector<std::string> argNames;
|
||||
if (functionArgs) {
|
||||
// Loop over the function arguments and get names and types for
|
||||
// each one in the args and argNames arrays
|
||||
for (unsigned int i = 0; i < functionArgs->size(); ++i) {
|
||||
Declaration *d = (*functionArgs)[i];
|
||||
Symbol *sym;
|
||||
if (d->declarators.size() == 0) {
|
||||
// function declaration like foo(float), w/o a name for
|
||||
// the parameter
|
||||
char buf[32];
|
||||
sprintf(buf, "__anon_parameter_%d", i);
|
||||
sym = new Symbol(buf, pos);
|
||||
Declarator *declarator = new Declarator(sym, sym->pos);
|
||||
sym->type = declarator->GetType(ds);
|
||||
d->declarators.push_back(declarator);
|
||||
}
|
||||
else {
|
||||
assert(d->declarators.size() == 1);
|
||||
sym = d->declarators[0]->sym;
|
||||
if (kind == DK_BASE) {
|
||||
// All of the type qualifiers should be in the DeclSpecs for the
|
||||
// base declarator
|
||||
AssertPos(pos, typeQualifiers == 0);
|
||||
AssertPos(pos, child == NULL);
|
||||
type = baseType;
|
||||
}
|
||||
else if (kind == DK_POINTER) {
|
||||
/* For now, any pointer to an SOA type gets the slice property; if
|
||||
we add the capability to declare pointers as slices or not,
|
||||
we'll want to set this based on a type qualifier here. */
|
||||
const Type *ptrType = new PointerType(baseType, variability, isConst,
|
||||
baseType->IsSOAType());
|
||||
if (child != NULL) {
|
||||
child->InitFromType(ptrType, ds);
|
||||
type = child->type;
|
||||
name = child->name;
|
||||
}
|
||||
else
|
||||
type = ptrType;
|
||||
}
|
||||
else if (kind == DK_REFERENCE) {
|
||||
if (hasUniformQual) {
|
||||
Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
|
||||
return;
|
||||
}
|
||||
if (hasVaryingQual) {
|
||||
Error(pos, "\"varying\" qualifier is illegal to apply to references.");
|
||||
return;
|
||||
}
|
||||
if (isConst) {
|
||||
Error(pos, "\"const\" qualifier is to illegal apply to references.");
|
||||
return;
|
||||
}
|
||||
// The parser should disallow this already, but double check.
|
||||
if (CastType<ReferenceType>(baseType) != NULL) {
|
||||
Error(pos, "References to references are illegal.");
|
||||
return;
|
||||
}
|
||||
|
||||
const Type *refType = new ReferenceType(baseType);
|
||||
if (child != NULL) {
|
||||
child->InitFromType(refType, ds);
|
||||
type = child->type;
|
||||
name = child->name;
|
||||
}
|
||||
else
|
||||
type = refType;
|
||||
}
|
||||
else if (kind == DK_ARRAY) {
|
||||
if (Type::Equal(baseType, AtomicType::Void)) {
|
||||
Error(pos, "Arrays of \"void\" type are illegal.");
|
||||
return;
|
||||
}
|
||||
if (CastType<ReferenceType>(baseType)) {
|
||||
Error(pos, "Arrays of references (type \"%s\") are illegal.",
|
||||
baseType->GetString().c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
const Type *arrayType = new ArrayType(baseType, arraySize);
|
||||
if (child != NULL) {
|
||||
child->InitFromType(arrayType, ds);
|
||||
type = child->type;
|
||||
name = child->name;
|
||||
}
|
||||
else
|
||||
type = arrayType;
|
||||
}
|
||||
else if (kind == DK_FUNCTION) {
|
||||
llvm::SmallVector<const Type *, 8> args;
|
||||
llvm::SmallVector<std::string, 8> argNames;
|
||||
llvm::SmallVector<Expr *, 8> argDefaults;
|
||||
llvm::SmallVector<SourcePos, 8> argPos;
|
||||
|
||||
// Loop over the function arguments and store the names, types,
|
||||
// default values (if any), and source file positions each one in
|
||||
// the corresponding vector.
|
||||
for (unsigned int i = 0; i < functionParams.size(); ++i) {
|
||||
Declaration *d = functionParams[i];
|
||||
|
||||
if (d == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
continue;
|
||||
}
|
||||
if (d->declarators.size() == 0) {
|
||||
// function declaration like foo(float), w/o a name for the
|
||||
// parameter; wire up a placeholder Declarator for it
|
||||
d->declarators.push_back(new Declarator(DK_BASE, pos));
|
||||
d->declarators[0]->InitFromDeclSpecs(d->declSpecs);
|
||||
}
|
||||
|
||||
AssertPos(pos, d->declarators.size() == 1);
|
||||
Declarator *decl = d->declarators[0];
|
||||
if (decl == NULL || decl->type == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (decl->name == "") {
|
||||
// Give a name to any anonymous parameter declarations
|
||||
char buf[32];
|
||||
sprintf(buf, "__anon_parameter_%d", i);
|
||||
decl->name = buf;
|
||||
}
|
||||
decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
|
||||
|
||||
if (d->declSpecs->storageClass != SC_NONE)
|
||||
Error(decl->pos, "Storage class \"%s\" is illegal in "
|
||||
"function parameter declaration for parameter \"%s\".",
|
||||
lGetStorageClassName(d->declSpecs->storageClass),
|
||||
decl->name.c_str());
|
||||
if (Type::Equal(decl->type, AtomicType::Void)) {
|
||||
Error(decl->pos, "Parameter with type \"void\" illegal in function "
|
||||
"parameter list.");
|
||||
decl->type = NULL;
|
||||
}
|
||||
|
||||
const ArrayType *at = CastType<ArrayType>(decl->type);
|
||||
if (at != NULL) {
|
||||
// As in C, arrays are passed to functions as pointers to
|
||||
// their element type. We'll just immediately make this
|
||||
// change now. (One shortcoming of losing the fact that
|
||||
// the it was originally an array is that any warnings or
|
||||
// errors later issued that print the function type will
|
||||
// report this differently than it was originally declared
|
||||
// in the function, but it's not clear that this is a
|
||||
// significant problem.)
|
||||
const Type *targetType = at->GetElementType();
|
||||
if (targetType == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
return;
|
||||
}
|
||||
|
||||
// Arrays are passed by reference, so convert array
|
||||
// parameters to be references here.
|
||||
if (dynamic_cast<const ArrayType *>(sym->type) != NULL)
|
||||
sym->type = new ReferenceType(sym->type, sym->type->IsConstType());
|
||||
decl->type = PointerType::GetUniform(targetType);
|
||||
|
||||
args.push_back(sym->type);
|
||||
argNames.push_back(sym->name);
|
||||
// Make sure there are no unsized arrays (other than the
|
||||
// first dimension) in function parameter lists.
|
||||
at = CastType<ArrayType>(targetType);
|
||||
while (at != NULL) {
|
||||
if (at->GetElementCount() == 0)
|
||||
Error(decl->pos, "Arrays with unsized dimensions in "
|
||||
"dimensions after the first one are illegal in "
|
||||
"function parameter lists.");
|
||||
at = CastType<ArrayType>(at->GetElementType());
|
||||
}
|
||||
}
|
||||
|
||||
args.push_back(decl->type);
|
||||
argNames.push_back(decl->name);
|
||||
argPos.push_back(decl->pos);
|
||||
|
||||
Expr *init = NULL;
|
||||
// Try to find an initializer expression.
|
||||
while (decl != NULL) {
|
||||
if (decl->initExpr != NULL) {
|
||||
decl->initExpr = TypeCheck(decl->initExpr);
|
||||
decl->initExpr = Optimize(decl->initExpr);
|
||||
if (decl->initExpr != NULL) {
|
||||
init = dynamic_cast<ConstExpr *>(decl->initExpr);
|
||||
if (init == NULL)
|
||||
init = dynamic_cast<NullPointerExpr *>(decl->initExpr);
|
||||
if (init == NULL)
|
||||
Error(decl->initExpr->pos, "Default value for parameter "
|
||||
"\"%s\" must be a compile-time constant.",
|
||||
decl->name.c_str());
|
||||
}
|
||||
break;
|
||||
}
|
||||
else
|
||||
decl = decl->child;
|
||||
}
|
||||
argDefaults.push_back(init);
|
||||
}
|
||||
|
||||
const Type *returnType = baseType;
|
||||
if (returnType == NULL) {
|
||||
Error(pos, "No return type provided in function declaration.");
|
||||
return;
|
||||
}
|
||||
|
||||
if (CastType<FunctionType>(returnType) != NULL) {
|
||||
Error(pos, "Illegal to return function type from function.");
|
||||
return;
|
||||
}
|
||||
|
||||
returnType = returnType->ResolveUnboundVariability(Variability::Varying);
|
||||
|
||||
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
|
||||
bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
|
||||
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
|
||||
bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);
|
||||
|
||||
if (isExported && isTask) {
|
||||
Error(pos, "Function can't have both \"task\" and \"export\" "
|
||||
"qualifiers");
|
||||
return;
|
||||
}
|
||||
if (isExternC && isTask) {
|
||||
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
|
||||
"qualifiers");
|
||||
return;
|
||||
}
|
||||
if (isExternC && isExported) {
|
||||
Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
|
||||
"qualifiers");
|
||||
return;
|
||||
}
|
||||
if (isUnmasked && isExported)
|
||||
Warning(pos, "\"unmasked\" qualifier is redundant for exported "
|
||||
"functions.");
|
||||
|
||||
if (child == NULL) {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
return;
|
||||
}
|
||||
|
||||
const FunctionType *functionType =
|
||||
new FunctionType(returnType, args, argNames, argDefaults,
|
||||
argPos, isTask, isExported, isExternC, isUnmasked);
|
||||
|
||||
// handle any explicit __declspecs on the function
|
||||
if (ds != NULL) {
|
||||
for (int i = 0; i < (int)ds->declSpecList.size(); ++i) {
|
||||
std::string str = ds->declSpecList[i].first;
|
||||
SourcePos pos = ds->declSpecList[i].second;
|
||||
|
||||
if (str == "safe")
|
||||
(const_cast<FunctionType *>(functionType))->isSafe = true;
|
||||
else if (!strncmp(str.c_str(), "cost", 4)) {
|
||||
int cost = atoi(str.c_str() + 4);
|
||||
if (cost < 0)
|
||||
Error(pos, "Negative function cost %d is illegal.",
|
||||
cost);
|
||||
(const_cast<FunctionType *>(functionType))->costOverride = cost;
|
||||
}
|
||||
else
|
||||
Error(pos, "__declspec parameter \"%s\" unknown.", str.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (ds->baseType == NULL) {
|
||||
Warning(pos, "No return type provided in declaration of function \"%s\". "
|
||||
"Treating as \"void\".", sym->name.c_str());
|
||||
ds->baseType = AtomicType::Void;
|
||||
}
|
||||
|
||||
if (isReference) {
|
||||
Error(pos, "Function return types can't be reference types.");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const Type *returnType = lGetType(this, ds, arraySize.begin());
|
||||
if (returnType == NULL)
|
||||
return NULL;
|
||||
|
||||
bool isExported = (ds->storageClass == SC_EXPORT);
|
||||
bool isExternC = (ds->storageClass == SC_EXTERN_C);
|
||||
return new FunctionType(returnType, args, pos, &argNames, isTask,
|
||||
isExported, isExternC);
|
||||
}
|
||||
else {
|
||||
if (isTask)
|
||||
Error(pos, "\"task\" qualifier illegal in variable declaration \"%s\".",
|
||||
sym->name.c_str());
|
||||
|
||||
const Type *type = lGetType(this, ds, arraySize.begin());
|
||||
|
||||
if (type != NULL && isReference) {
|
||||
bool hasConstQual = ((ds->typeQualifier & TYPEQUAL_CONST) != 0);
|
||||
type = new ReferenceType(type, hasConstQual);
|
||||
}
|
||||
|
||||
return type;
|
||||
child->InitFromType(functionType, ds);
|
||||
type = child->type;
|
||||
name = child->name;
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Declaration
|
||||
|
||||
void
|
||||
Declaration::AddSymbols(SymbolTable *st) const {
|
||||
assert(declSpecs->storageClass != SC_TYPEDEF);
|
||||
|
||||
Declaration::Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist) {
|
||||
declSpecs = ds;
|
||||
if (dlist != NULL)
|
||||
declarators = *dlist;
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i)
|
||||
if (declarators[i])
|
||||
st->AddVariable(declarators[i]->sym);
|
||||
if (declarators[i] != NULL)
|
||||
declarators[i]->InitFromDeclSpecs(declSpecs);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Declaration::Print() const {
|
||||
printf("Declaration: specs [");
|
||||
declSpecs->Print();
|
||||
printf("], declarators [");
|
||||
for (unsigned int i = 0 ; i < declarators.size(); ++i) {
|
||||
declarators[i]->Print();
|
||||
printf("%s", (i == declarators.size() - 1) ? "]" : ", ");
|
||||
Declaration::Declaration(DeclSpecs *ds, Declarator *d) {
|
||||
declSpecs = ds;
|
||||
if (d != NULL) {
|
||||
d->InitFromDeclSpecs(ds);
|
||||
declarators.push_back(d);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
std::vector<VariableDeclaration>
|
||||
Declaration::GetVariableDeclarations() const {
|
||||
Assert(declSpecs->storageClass != SC_TYPEDEF);
|
||||
std::vector<VariableDeclaration> vars;
|
||||
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i) {
|
||||
Declarator *decl = declarators[i];
|
||||
if (decl == NULL || decl->type == NULL) {
|
||||
// Ignore earlier errors
|
||||
Assert(m->errorCount > 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (Type::Equal(decl->type, AtomicType::Void))
|
||||
Error(decl->pos, "\"void\" type variable illegal in declaration.");
|
||||
else if (CastType<FunctionType>(decl->type) == NULL) {
|
||||
decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
|
||||
Symbol *sym = new Symbol(decl->name, decl->pos, decl->type,
|
||||
decl->storageClass);
|
||||
m->symbolTable->AddVariable(sym);
|
||||
vars.push_back(VariableDeclaration(sym, decl->initExpr));
|
||||
}
|
||||
}
|
||||
|
||||
return vars;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Declaration::DeclareFunctions() {
|
||||
Assert(declSpecs->storageClass != SC_TYPEDEF);
|
||||
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i) {
|
||||
Declarator *decl = declarators[i];
|
||||
if (decl == NULL || decl->type == NULL) {
|
||||
// Ignore earlier errors
|
||||
Assert(m->errorCount > 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
const FunctionType *ftype = CastType<FunctionType>(decl->type);
|
||||
if (ftype == NULL)
|
||||
continue;
|
||||
|
||||
bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
|
||||
m->AddFunctionDeclaration(decl->name, ftype, decl->storageClass,
|
||||
isInline, decl->pos);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Declaration::Print(int indent) const {
|
||||
printf("%*cDeclaration: specs [", indent, ' ');
|
||||
declSpecs->Print();
|
||||
printf("], declarators:\n");
|
||||
for (unsigned int i = 0 ; i < declarators.size(); ++i)
|
||||
declarators[i]->Print(indent+4);
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void
|
||||
GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
|
||||
std::vector<const Type *> *elementTypes,
|
||||
std::vector<std::string> *elementNames,
|
||||
std::vector<SourcePos> *elementPositions) {
|
||||
llvm::SmallVector<const Type *, 8> *elementTypes,
|
||||
llvm::SmallVector<std::string, 8> *elementNames,
|
||||
llvm::SmallVector<SourcePos, 8> *elementPositions) {
|
||||
std::set<std::string> seenNames;
|
||||
for (unsigned int i = 0; i < sd.size(); ++i) {
|
||||
const Type *type = sd[i]->type;
|
||||
if (type == NULL)
|
||||
continue;
|
||||
|
||||
// FIXME: making this fake little DeclSpecs here is really
|
||||
// disgusting
|
||||
DeclSpecs ds(type);
|
||||
if (type->IsUniformType())
|
||||
ds.typeQualifier |= TYPEQUAL_UNIFORM;
|
||||
else
|
||||
ds.typeQualifier |= TYPEQUAL_VARYING;
|
||||
if (Type::Equal(type, AtomicType::Void) == false) {
|
||||
if (type->IsUniformType())
|
||||
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
|
||||
else if (type->IsVaryingType())
|
||||
ds.typeQualifiers |= TYPEQUAL_VARYING;
|
||||
else if (type->GetSOAWidth() != 0)
|
||||
ds.soaWidth = type->GetSOAWidth();
|
||||
// FIXME: ds.vectorSize?
|
||||
}
|
||||
|
||||
for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
|
||||
Declarator *d = (*sd[i]->declarators)[j];
|
||||
d->InitFromDeclSpecs(&ds);
|
||||
|
||||
// if it's an unsized array, make it a reference to an unsized
|
||||
// array, so the caller can pass a pointer...
|
||||
const ArrayType *at = dynamic_cast<const ArrayType *>(d->sym->type);
|
||||
if (at && at->GetElementCount() == 0)
|
||||
d->sym->type = new ReferenceType(d->sym->type, type->IsConstType());
|
||||
if (Type::Equal(d->type, AtomicType::Void))
|
||||
Error(d->pos, "\"void\" type illegal for struct member.");
|
||||
|
||||
elementTypes->push_back(d->sym->type);
|
||||
elementNames->push_back(d->sym->name);
|
||||
elementPositions->push_back(d->sym->pos);
|
||||
elementTypes->push_back(d->type);
|
||||
|
||||
if (seenNames.find(d->name) != seenNames.end())
|
||||
Error(d->pos, "Struct member \"%s\" has same name as a "
|
||||
"previously-declared member.", d->name.c_str());
|
||||
else
|
||||
seenNames.insert(d->name);
|
||||
|
||||
elementNames->push_back(d->name);
|
||||
elementPositions->push_back(d->pos);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int)elementTypes->size() - 1; ++i) {
|
||||
const ArrayType *arrayType = CastType<ArrayType>((*elementTypes)[i]);
|
||||
|
||||
if (arrayType != NULL && arrayType->GetElementCount() == 0)
|
||||
Error((*elementPositions)[i], "Unsized arrays aren't allowed except "
|
||||
"for the last member in a struct definition.");
|
||||
}
|
||||
}
|
||||
|
||||
136
decl.h
136
decl.h
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -47,24 +47,20 @@
|
||||
variables--here, that the declaration has the 'static' and 'uniform'
|
||||
qualifiers, and that its basic type is 'int'. Then for each variable
|
||||
declaration, the Declaration class holds an instance of a Declarator,
|
||||
which in turn records the per-variable information like the symbol
|
||||
name, array size (if any), initializer expression, etc.
|
||||
which in turn records the per-variable information like the name, array
|
||||
size (if any), initializer expression, etc.
|
||||
*/
|
||||
|
||||
#ifndef ISPC_DECL_H
|
||||
#define ISPC_DECL_H
|
||||
|
||||
#include "ispc.h"
|
||||
#include <llvm/ADT/SmallVector.h>
|
||||
|
||||
enum StorageClass {
|
||||
SC_NONE,
|
||||
SC_EXTERN,
|
||||
SC_EXPORT,
|
||||
SC_STATIC,
|
||||
SC_TYPEDEF,
|
||||
SC_EXTERN_C
|
||||
};
|
||||
struct VariableDeclaration;
|
||||
|
||||
class Declaration;
|
||||
class Declarator;
|
||||
|
||||
/* Multiple qualifiers can be provided with types in declarations;
|
||||
therefore, they are set up so that they can be ORed together into an
|
||||
@@ -74,9 +70,11 @@ enum StorageClass {
|
||||
#define TYPEQUAL_UNIFORM (1<<1)
|
||||
#define TYPEQUAL_VARYING (1<<2)
|
||||
#define TYPEQUAL_TASK (1<<3)
|
||||
#define TYPEQUAL_REFERENCE (1<<4)
|
||||
#define TYPEQUAL_SIGNED (1<<4)
|
||||
#define TYPEQUAL_UNSIGNED (1<<5)
|
||||
#define TYPEQUAL_INLINE (1<<6)
|
||||
#define TYPEQUAL_EXPORT (1<<7)
|
||||
#define TYPEQUAL_UNMASKED (1<<8)
|
||||
|
||||
/** @brief Representation of the declaration specifiers in a declaration.
|
||||
|
||||
@@ -85,22 +83,25 @@ enum StorageClass {
|
||||
*/
|
||||
class DeclSpecs {
|
||||
public:
|
||||
DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE, int tq = TYPEQUAL_NONE);
|
||||
DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE,
|
||||
int tq = TYPEQUAL_NONE);
|
||||
|
||||
void Print() const;
|
||||
|
||||
StorageClass storageClass;
|
||||
|
||||
/** Zero or more of the TYPEQUAL_* values, ORed together. */
|
||||
int typeQualifier;
|
||||
int typeQualifiers;
|
||||
|
||||
/** The basic type provided in the declaration; this should be an
|
||||
AtomicType, a StructType, or a VectorType; other types (like
|
||||
AtomicType, EnumType, StructType, or VectorType; other types (like
|
||||
ArrayTypes) will end up being created if a particular declaration
|
||||
has an array size, etc.
|
||||
*/
|
||||
const Type *baseType;
|
||||
|
||||
const Type *GetBaseType(SourcePos pos) const;
|
||||
|
||||
/** If this is a declaration with a vector type, this gives the vector
|
||||
width. For non-vector types, this is zero.
|
||||
*/
|
||||
@@ -110,9 +111,19 @@ public:
|
||||
SOA width specified. Otherwise this is zero.
|
||||
*/
|
||||
int soaWidth;
|
||||
|
||||
std::vector<std::pair<std::string, SourcePos> > declSpecList;
|
||||
};
|
||||
|
||||
|
||||
enum DeclaratorKind {
|
||||
DK_BASE,
|
||||
DK_POINTER,
|
||||
DK_REFERENCE,
|
||||
DK_ARRAY,
|
||||
DK_FUNCTION
|
||||
};
|
||||
|
||||
/** @brief Representation of the declaration of a single variable.
|
||||
|
||||
In conjunction with an instance of the DeclSpecs, this gives us
|
||||
@@ -120,35 +131,53 @@ public:
|
||||
*/
|
||||
class Declarator {
|
||||
public:
|
||||
Declarator(Symbol *s, SourcePos p);
|
||||
|
||||
/** As the parser peels off array dimension declarations after the
|
||||
symbol name, it calls this method to provide them to the
|
||||
Declarator.
|
||||
*/
|
||||
void AddArrayDimension(int size);
|
||||
Declarator(DeclaratorKind dk, SourcePos p);
|
||||
|
||||
/** Once a DeclSpecs instance is available, this method completes the
|
||||
initialization of the Symbol, setting its Type accordingly.
|
||||
initialization of the type member.
|
||||
*/
|
||||
void InitFromDeclSpecs(DeclSpecs *ds);
|
||||
|
||||
/** Get the actual type of the combination of Declarator and the given
|
||||
DeclSpecs */
|
||||
const Type *GetType(DeclSpecs *ds) const;
|
||||
void InitFromType(const Type *base, DeclSpecs *ds);
|
||||
|
||||
void Print() const;
|
||||
void Print(int indent) const;
|
||||
|
||||
/** Position of the declarator in the source program. */
|
||||
const SourcePos pos;
|
||||
Symbol *sym;
|
||||
/** If this declarator includes an array specification, the sizes of
|
||||
the array dimensions are represented here.
|
||||
*/
|
||||
std::vector<int> arraySize;
|
||||
|
||||
/** The kind of this declarator; complex declarations are assembled as
|
||||
a hierarchy of Declarators. (For example, a pointer to an int
|
||||
would have a root declarator with kind DK_POINTER and with the
|
||||
Declarator::child member pointing to a DK_BASE declarator for the
|
||||
int). */
|
||||
const DeclaratorKind kind;
|
||||
|
||||
/** Child pointer if needed; this can only be non-NULL if the
|
||||
declarator's kind isn't DK_BASE. */
|
||||
Declarator *child;
|
||||
|
||||
/** Type qualifiers provided with the declarator. */
|
||||
int typeQualifiers;
|
||||
|
||||
StorageClass storageClass;
|
||||
|
||||
/** For array declarators, this gives the declared size of the array.
|
||||
Unsized arrays have arraySize == 0. */
|
||||
int arraySize;
|
||||
|
||||
/** Name associated with the declarator. */
|
||||
std::string name;
|
||||
|
||||
/** Initialization expression for the variable. May be NULL. */
|
||||
Expr *initExpr;
|
||||
bool isFunction;
|
||||
std::vector<Declaration *> *functionArgs;
|
||||
|
||||
/** Type of the declarator. This is NULL until InitFromDeclSpecs() or
|
||||
InitFromType() is called. */
|
||||
const Type *type;
|
||||
|
||||
/** For function declarations, this holds the Declaration *s for the
|
||||
function's parameters. */
|
||||
std::vector<Declaration *> functionParams;
|
||||
};
|
||||
|
||||
|
||||
@@ -157,26 +186,21 @@ public:
|
||||
*/
|
||||
class Declaration {
|
||||
public:
|
||||
Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL) {
|
||||
declSpecs = ds;
|
||||
if (dlist != NULL)
|
||||
declarators = *dlist;
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i)
|
||||
if (declarators[i] != NULL)
|
||||
declarators[i]->InitFromDeclSpecs(declSpecs);
|
||||
}
|
||||
Declaration(DeclSpecs *ds, Declarator *d) {
|
||||
declSpecs = ds;
|
||||
if (d) {
|
||||
d->InitFromDeclSpecs(ds);
|
||||
declarators.push_back(d);
|
||||
}
|
||||
}
|
||||
Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL);
|
||||
Declaration(DeclSpecs *ds, Declarator *d);
|
||||
|
||||
/** Adds the symbols for the variables in the declaration to the symbol
|
||||
table. */
|
||||
void AddSymbols(SymbolTable *st) const;
|
||||
void Print() const;
|
||||
void Print(int indent) const;
|
||||
|
||||
/** This method walks through all of the Declarators in a declaration
|
||||
and returns a fully-initialized Symbol and (possibly) an
|
||||
initialization expression for each one. (This allows the rest of
|
||||
the system to not have to worry about the mess of the general
|
||||
Declarator representation.) */
|
||||
std::vector<VariableDeclaration> GetVariableDeclarations() const;
|
||||
|
||||
/** For any function declarations in the Declaration, add the
|
||||
declaration to the module. */
|
||||
void DeclareFunctions();
|
||||
|
||||
DeclSpecs *declSpecs;
|
||||
std::vector<Declarator *> declarators;
|
||||
@@ -197,8 +221,8 @@ struct StructDeclaration {
|
||||
/** Given a set of StructDeclaration instances, this returns the types of
|
||||
the elements of the corresponding struct and their names. */
|
||||
extern void GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
|
||||
std::vector<const Type *> *elementTypes,
|
||||
std::vector<std::string> *elementNames,
|
||||
std::vector<SourcePos> *elementPositions);
|
||||
llvm::SmallVector<const Type *, 8> *elementTypes,
|
||||
llvm::SmallVector<std::string, 8> *elementNames,
|
||||
llvm::SmallVector<SourcePos, 8> *elementPositions);
|
||||
|
||||
#endif // ISPC_DECL_H
|
||||
|
||||
@@ -1,3 +1,450 @@
|
||||
=== v1.3.0 === (29 June 2012)
|
||||
|
||||
This is a major new release of ispc, with support for more compilation
|
||||
targets and a number of additions to the language. As usual, the quality
|
||||
of generated code has also been improved in a number of cases and a number
|
||||
of small bugs have been fixed.
|
||||
|
||||
New targets:
|
||||
|
||||
* This release provides "beta" support for compiling to Intel Xeon Phi (the
|
||||
"Many Integrated Core" architecture). See
|
||||
http://ispc.github.com/ispc.html#compiling-for-the-intel-xeon-phi-architecture
|
||||
for more details on this support.
|
||||
|
||||
* This release also has an "avx1.1" target, which provides support for the
|
||||
new instructions in the Intel Ivy Bridge microarchitecture.
|
||||
|
||||
New language features:
|
||||
|
||||
* The foreach_active statement allows iteration over the active program
|
||||
instances in a gang. (See
|
||||
http://ispc.github.com/ispc.html#iteration-over-active-program-instances-foreach-active)
|
||||
|
||||
* foreach_unique allows iterating over subsets of program instances in a
|
||||
gang that share the same value of a variable. (See
|
||||
http://ispc.github.com/ispc.html#iteration-over-unique-elements-foreach-unique)
|
||||
|
||||
* An "unmasked" function qualifier and statement in the language allow
|
||||
re-activating execution of all program instances in a gang. (See
|
||||
http://ispc.github.com/ispc.html#re-establishing-the-execution-mask)
|
||||
|
||||
Standard library updates:
|
||||
|
||||
* The seed_rng() function has been modified to take a "varying" seed value
|
||||
when a varying RNGState is being initialized.
|
||||
|
||||
* An isnan() function has been added, to check for floating-point "not a
|
||||
number" values.
|
||||
|
||||
* The float_to_srgb8() routine does high performance conversion of
|
||||
floating-point color values to SRGB8 format.
|
||||
|
||||
Other changes:
|
||||
|
||||
* A number of bugfixes have been made for compiler crashes with malformed
|
||||
programs.
|
||||
|
||||
* Floating-point comparisons are now "unordered", so that any comparison
|
||||
where one of the operands is a "not a number" value returns false. (This
|
||||
matches standard IEEE floating-point behavior.)
|
||||
|
||||
* The code generated for 'break' statements in "varying" loops has been
|
||||
improved for some common cases.
|
||||
|
||||
* Compile time and compiler memory use have both been improved,
|
||||
particularly for large input programs.
|
||||
|
||||
* A number of bugs have been fixed in the debugging information generated
|
||||
by the compiler when the "-g" command-line flag is used.
|
||||
|
||||
=== v1.2.2 === (20 April 2012)
|
||||
|
||||
This release includes a number of small additions to functionality and a
|
||||
number of bugfixes. New functionality includes:
|
||||
|
||||
* It's now possible to forward declare structures as in C/C++: "struct
|
||||
Foo;". After such a declaration, structs with pointers to "Foo" and
|
||||
functions that take pointers or references to Foo structs can be declared
|
||||
without the entire definition of Foo being available.
|
||||
|
||||
* New built-in types size_t, ptrdiff_t, and [u]intptr_t are now available,
|
||||
corresponding to the equivalent types in C.
|
||||
|
||||
* The standard library now provides atomic_swap*() and
|
||||
atomic_compare_exchange*() functions for void * types.
|
||||
|
||||
* The C++ backend has seen a number of improvements to the quality and
|
||||
readability of generated code.
|
||||
|
||||
A number of bugs have been fixed in this release as well. The most
|
||||
significant are:
|
||||
|
||||
* Fixed a bug where nested loops could cause a compiler crash in some
|
||||
circumstances (issues #240 and #229)
|
||||
|
||||
* Gathers could access invalid memory (and cause the program to crash) in
|
||||
some circumstances (#235)
|
||||
|
||||
* References to temporary values are now handled properly when passed to a
|
||||
function that takes a reference typed parameter.
|
||||
|
||||
* A case where incorrect code could be generated for compile-time-constant
|
||||
initializers has been fixed (#234).
|
||||
|
||||
=== v1.2.1 === (6 April 2012)
|
||||
|
||||
This release contains only minor new functionality and is mostly for many
|
||||
small bugfixes and improvements to error handling and error reporting.
|
||||
The new functionality that is present is:
|
||||
|
||||
* Significantly more efficient versions of the float / half conversion
|
||||
routines are now available in the standard library, thanks to Fabian
|
||||
Giesen.
|
||||
|
||||
* The last member of a struct can now be a zero-length array; this allows
|
||||
the trick of dynamically allocating enough storage for the struct and
|
||||
some number of array elements at the end of it.
|
||||
|
||||
Significant bugs fixed include:
|
||||
|
||||
* Issue #205: When a target ISA isn't specified, use the host system's
|
||||
capabilities to choose a target for which it will be able to run the
|
||||
generated code.
|
||||
|
||||
* Issues #215 and #217: Don't allocate storage for global variables that
|
||||
are declared "extern".
|
||||
|
||||
* Issue #197: Allow NULL as a default argument value in a function
|
||||
declaration.
|
||||
|
||||
* Issue #223: Fix bugs where taking the address of a function wouldn't work
|
||||
as expected.
|
||||
|
||||
* Issue #224: When there are overloaded variants of a function that take
|
||||
both reference and const reference parameters, give the non-const
|
||||
reference preference when matching values of that underlying type.
|
||||
|
||||
* Issue #225: An error is issued when a varying lvalue is assigned to a
|
||||
reference type (rather than crashing).
|
||||
|
||||
* Issue #193: Permit conversions from array types to void *, not just the
|
||||
pointer type of the underlying array element.
|
||||
|
||||
* Issue #199: Still evaluate expressions that are cast to (void).
|
||||
|
||||
The documentation has also been improved, with FAQs added to clarify some
|
||||
aspects of the ispc pointer model.
|
||||
|
||||
=== v1.2.0 === (20 March 2012)
|
||||
|
||||
This is a major new release of ispc, with a number of significant
|
||||
improvements to functionality, performance, and compiler robustness. It
|
||||
does, however, include three small changes to language syntax and semantics
|
||||
that may require changes to existing programs:
|
||||
|
||||
* Syntax for the "launch" keyword has been cleaned up; it's now no longer
|
||||
necessary to bracket the launched function call with angle brackets.
|
||||
(In other words, now use "launch foo();", rather than "launch < foo() >;".)
|
||||
|
||||
* When using pointers, the pointed-to data type is now "uniform" by
|
||||
default. Use the varying keyword to specify varying pointed-to types when
|
||||
needed. (i.e. "float *ptr" is a varying pointer to uniform float data,
|
||||
whereas previously it was a varying pointer to varying float values.)
|
||||
Use "varying float *" to specify a varying pointer to varying float data,
|
||||
and so forth.
|
||||
|
||||
* The details of "uniform" and "varying" and how they interact with struct
|
||||
types have been cleaned up. Now, when a struct type is declared, if the
|
||||
struct elements don't have explicit "uniform" or "varying" qualifiers,
|
||||
they are said to have "unbound" variability. When a struct type is
|
||||
instantiated, any unbound variability elements inherit the variability of
|
||||
the parent struct type. See http://ispc.github.com/ispc.html#struct-types
|
||||
for more details.
|
||||
|
||||
ispc has a new language feature that makes it much easier to use the
|
||||
efficient "(array of) structure of arrays" (AoSoA, or SoA) memory layout of
|
||||
data. A new "soa<n>" qualifier can be applied to structure types to
|
||||
specify an n-wide SoA version of the corresponding type. Array indexing
|
||||
and pointer operations with arrays of SoA types automatically handle the
|
||||
two-stage indexing calculation to access the data. See
|
||||
http://ispc.github.com/ispc.html#structure-of-array-types for more details.
|
||||
|
||||
For more efficient access of data that is still in "array of structures"
|
||||
(AoS) format, ispc has a new "memory coalescing" optimization that
|
||||
automatically detects series of strided loads and/or gathers that can be
|
||||
transformed into a more efficient set of vector loads and shuffles. A
|
||||
diagnostic is emitted when this optimization is successfully applied.
|
||||
|
||||
Smaller changes in this release:
|
||||
|
||||
* The standard library now provides memcpy(), memmove() and memset()
|
||||
functions, as well as single-precision asin() and acos() functions.
|
||||
|
||||
* -I can now be specified on the command-line to specify a search path for
|
||||
#include files.
|
||||
|
||||
* A number of improvements have been made to error reporting from the
|
||||
parser, and a number of cases where malformed programs could cause the
|
||||
compiler to crash have been fixed.
|
||||
|
||||
* A number of small improvements to the quality and performance of generated
|
||||
code have been made, including finding more cases where 32-bit addressing
|
||||
calculations can be safely done on 64-bit systems and generating better
|
||||
code for initializer expressions.
|
||||
|
||||
=== v1.1.4 === (4 February 2012)
|
||||
|
||||
There are two major bugfixes for Windows in this release. First, a number
|
||||
of failures in AVX code generation on Windows have been fixed; AVX on
|
||||
Windows now has no known issues. Second, a longstanding bug in parsing 64-bit
|
||||
integer constants on Windows has been fixed.
|
||||
|
||||
This release features a new experimental scalar target, contributed by Gabe
|
||||
Weisz <gweisz@cs.cmu.edu>. This target ("--target=generic-1") compiles
|
||||
gangs of single program instances (i.e. programCount == 1); it can be
|
||||
useful for debugging ispc programs.
|
||||
|
||||
The compiler now supports dynamic memory allocation in ispc programs (with
|
||||
"new" and "delete" operators based on C++). See
|
||||
http://ispc.github.com/ispc.html#dynamic-memory-allocation in the
|
||||
documentation for more information.
|
||||
|
||||
ispc now performs "short circuit" evaluation of the || and && logical
|
||||
operators and the ? : selection operator. (This represents the correction
|
||||
of a major incompatibility with C.) Code like "(index < arraySize &&
|
||||
array[index] == 1)" thus now executes as in C, where "array[index]" won't
|
||||
be evaluated unless "index" is less than "arraySize".
|
||||
|
||||
The standard library now provides "local" atomic operations, which are
|
||||
atomic across the gang of program instances (but not across other gangs or
|
||||
other hardware threads). See the updated documentation on atomics for more
|
||||
information:
|
||||
http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences.
|
||||
|
||||
The standard library now offers a clock() function, which returns a uniform
|
||||
int64 value that counts processor cycles; it can be used for
|
||||
fine-resolution timing measurements.
|
||||
|
||||
Finally (of limited interest now): ispc now supports the forthcoming AVX2
|
||||
instruction set, due with Haswell-generation CPUs. All tests and examples
|
||||
compile and execute correctly with AVX2. (Thanks specifically to Craig
|
||||
Topper and Nadav Rotem for work on AVX2 support in LLVM, which made this
|
||||
possible.)
|
||||
|
||||
=== v1.1.3 === (20 January 2012)
|
||||
|
||||
With this release, the language now supports "switch" statements, with the
|
||||
same semantics and syntax as in C.
|
||||
|
||||
This release includes fixes for two important performance related issues:
|
||||
the quality of code generated for "foreach" statements has been
|
||||
substantially improved (https://github.com/ispc/ispc/issues/151), and a
|
||||
performance regression with code for "gathers" that was introduced in
|
||||
v1.1.2 has been fixed in this release.
|
||||
|
||||
A number of other small bugs were fixed in this release as well, including
|
||||
one where invalid memory would sometimes be incorrectly accessed
|
||||
(https://github.com/ispc/ispc/issues/160).
|
||||
|
||||
Thanks to Jean-Luc Duprat for a number of patches that improve support for
|
||||
building on various platforms, and to Pierre-Antoine Lacaze for patches so
|
||||
that ispc builds under MinGW.
|
||||
|
||||
=== v1.1.2 === (9 January 2012)
|
||||
|
||||
The major new feature in this release is support for "generic" C++
|
||||
vectorized output; in other words, ispc can emit C++ code that corresponds
|
||||
to the vectorized computation that the ispc program represents. See the
|
||||
examples/intrinsics directory in the ispc distribution for two example
|
||||
implementations of the set of functions that must be provided to map the
|
||||
vector calls generated by ispc to target specific functions.
|
||||
|
||||
ispc now has partial support for 'goto' statements; specifically, goto is
|
||||
allowed if any enclosing control flow statements (if/for/while/do) have
|
||||
'uniform' test expressions, but not if they have 'varying' tests.
|
||||
|
||||
A number of improvements have been made to the code generated for gathers
|
||||
and scatters--one of them (better matching x86's "free" scale by 2/4/8 for
|
||||
addressing calculations) improved the performance of the noise example by
|
||||
14%.
|
||||
|
||||
Many small bugs have been fixed in this release as well, including issue
|
||||
numbers 138, 129, 135, 127, 149, and 142.
|
||||
|
||||
=== v1.1.1 === (15 December 2011)
|
||||
|
||||
This release doesn't include any significant new functionality, but does
|
||||
include small improvements in generated code and a number of bug fixes.
|
||||
|
||||
The one user-visible language change is that integer constants may be
|
||||
specified with 'u' and 'l' suffixes, like in C. For example, "1024llu"
|
||||
defines the constant with unsigned 64-bit type.
|
||||
|
||||
More informative and useful error messages are printed when function
|
||||
overload resolution fails.
|
||||
|
||||
Masking is avoided in additional cases when the mask can be
|
||||
statically-determined to be all on.
|
||||
|
||||
A number of small bugs have been fixed:
|
||||
- Under some circumstances, incorrect masks were used when assigning a
|
||||
value to a reference and when doing gathers/scatters.
|
||||
- Incorrect code could be generated in some cases when some instances
|
||||
returned part way through a function but others continued executing.
|
||||
- Type checking wasn't being performed for calls through function pointers;
|
||||
now an error is issued if the arguments don't match up, etc.
|
||||
- Incorrect code was being generated for gather/scatter to structs that had
|
||||
elements with varying short-vector types.
|
||||
- Typechecking wasn't being performed for "foreach" statements; this led to
|
||||
problems like function overload resolution not being performed if an
|
||||
overloaded function call was used to determine the iteration range.
|
||||
- A number of symbols would be multiply-defined when compiling to multiple
|
||||
targets and using the sse2-x2 target as one of them (issue #131).
|
||||
|
||||
=== v1.1.0 === (5 December 2011)
|
||||
|
||||
This is a major new release of the compiler, with significant additions to
|
||||
language functionality and capabilities. It includes a number of small
|
||||
language syntax changes that will require modification of existing
|
||||
programs. These changes should generally be straightforward and all are
|
||||
steps toward eliminating parts of ispc syntax that are incompatible with
|
||||
C/C++. See
|
||||
http://ispc.github.com/ispc.html#updating-ispc-programs-for-changes-in-ispc-1-1
|
||||
for more information about these changes.
|
||||
|
||||
ispc now fully supports pointers, including pointer arithmetic, implicit
|
||||
conversions of arrays to pointers, and all of the other capabilities of
|
||||
pointers in C. See http://ispc.github.com/ispc.html#pointer-types for more
|
||||
information about pointers in ispc and
|
||||
http://ispc.github.com/ispc.html#function-pointer-types for information
|
||||
about function pointers in ispc.
|
||||
|
||||
Reference types are now declared with C++ syntax (e.g. "const float &foo").
|
||||
|
||||
ispc now supports 64-bit addressing. For performance reasons, this
|
||||
capability is disabled by default (even on 64-bit targets), but can be
|
||||
enabled with a command-line flag:
|
||||
http://ispc.github.com/ispc.html#selecting-32-or-64-bit-addressing.
|
||||
|
||||
This release features new parallel "foreach" statements, which make it
|
||||
easier in many instances to map program instances to data for data-parallel
|
||||
computation than the programIndex/programCount mechanism:
|
||||
http://ispc.github.com/ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled.
|
||||
|
||||
Finally, all of the system's documentation has been significantly revised.
|
||||
The documentation of ispc's parallel execution model has been rewritten:
|
||||
http://ispc.github.com/ispc.html#the-ispc-parallel-execution-model, and
|
||||
there is now a more specific discussion of similarities and differences
|
||||
between ispc and C/C++:
|
||||
http://ispc.github.com/ispc.html#relationship-to-the-c-programming-language.
|
||||
There is now a separate FAQ (http://ispc.github.com/faq.html), and a
|
||||
Performance Guide (http://ispc.github.com/perfguide.html).
|
||||
|
||||
=== v1.0.12 === (20 October 2011)
|
||||
|
||||
This release includes a new "double-pumped" 8-wide target for SSE2,
|
||||
"sse2-x2". Like the sse4-x2 and avx-x2 targets, this target may deliver
|
||||
higher performance for some workloads than the regular sse2 target. (For
|
||||
other workloads, it may be slower.)
|
||||
|
||||
The ispc language now includes an "assert()" statement. See
|
||||
http://ispc.github.com/ispc.html#assertions for more information.
|
||||
|
||||
The compiler now sets a preprocessor #define based on the target ISA; for
|
||||
example, ISPC_TARGET_SSE4 is defined for the sse4 targets, and so forth.
|
||||
|
||||
The standard library now provides high-performance routines for converting
|
||||
between some "array of structures" and "structure of arrays" formats.
|
||||
See
|
||||
http://ispc.github.com/ispc.html#converting-between-array-of-structures-and-structure-of-arrays-layout
|
||||
for more information.
|
||||
|
||||
Inline functions now have static linkage.
|
||||
|
||||
A number of improvements have been made to the optimization passes that
|
||||
detect when gathers and scatters can be transformed into vector loads and
|
||||
stores, respectively. In particular, these passes now handle variables that
|
||||
are used as loop induction variables much better.
|
||||
|
||||
=== v1.0.11 === (6 October 2011)
|
||||
|
||||
The main new feature in this release is support for generating code for
|
||||
multiple targets (e.g., SSE2, SSE4, and AVX) and having the compiled code
|
||||
select the best variant at execution time. For more information, see
|
||||
http://ispc.github.com/ispc.html#compiling-with-support-for-multiple-instruction-sets.
|
||||
|
||||
All of the examples now take advantage of the support for multiple
|
||||
compilation targets; thus, if one has an AVX system, it's not necessary to
|
||||
recompile the examples to use the AVX target.
|
||||
|
||||
Performance of the built-in task system that is used in the examples has
|
||||
been improved.
|
||||
|
||||
Finally, the print() statement now works on OSX; it had been broken for the
|
||||
last few releases.
|
||||
|
||||
=== v1.0.10 === (30 September 2011)
|
||||
|
||||
This release features an extensive new example showing the application of
|
||||
ispc to a deferred shading algorithm for scenes with thousands of lights
|
||||
(examples/deferred). This is an implementation of the algorithm that Johan
|
||||
Andersson described at SIGGRAPH 2009 and was implemented by Andrew
|
||||
Lauritzen and Jefferson Montgomery. The basic idea is that a pre-rendered
|
||||
G-buffer is partitioned into tiles, and in each tile, the set of lights
|
||||
that contribute to the tile is computed. Then, the pixels in the tile are
|
||||
shaded using those light sources. (See slides 19-29 of
|
||||
http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
|
||||
for more details on the algorithm.)
|
||||
|
||||
The mechanism for launching tasks from ispc code has been generalized to
|
||||
allow multiple tasks to be launched with a single launch call (see
|
||||
http://ispc.github.com/ispc.html#task-parallelism-language-syntax for more
|
||||
information.)
|
||||
|
||||
A few new functions have been added to the standard library: num_cores()
|
||||
returns the number of cores in the system's CPU, and variants of all of the
|
||||
atomic operators that take 'uniform' values as parameters have been added.
|
||||
|
||||
=== v1.0.9 === (26 September 2011)
|
||||
|
||||
The binary release of v1.0.9 is the first that supports AVX code
|
||||
generation. Two targets are provided: "avx", which runs with a
|
||||
programCount of 8, and "avx-x2" which runs 16 program instances
|
||||
simultaneously. (This binary is also built using the in-progress LLVM 3.0
|
||||
development libraries, while previous ones have been built with the
|
||||
released 2.9 version of LLVM.)
|
||||
|
||||
This release has no other significant changes beyond a number of small
|
||||
bugfixes (https://github.com/ispc/ispc/issues/100,
|
||||
https://github.com/ispc/ispc/issues/101, https://github.com/ispc/ispc/issues/103.)
|
||||
|
||||
=== v1.0.8 === (19 September 2011)
|
||||
|
||||
A number of improvements have been made to handling of 'if' statements in
|
||||
the language:
|
||||
- A bug was fixed where invalid memory could be incorrectly accessed even
|
||||
if none of the running program instances wanted to execute the
|
||||
corresponding instructions (https://github.com/ispc/ispc/issues/74).
|
||||
- The code generated for 'if' statements is a bit simpler and thus more
|
||||
efficient.
|
||||
|
||||
There is now a '--pic' command-line argument that causes position-independent
|
||||
code to be generated (Linux and OSX only).
|
||||
|
||||
A number of additional performance improvements:
|
||||
- Loops are now unrolled by default; the --opt=disable-loop-unroll
|
||||
command-line argument can be used to disable this behavior.
|
||||
(https://github.com/ispc/ispc/issues/78)
|
||||
- A few more cases where gathers/scatters could be determined at compile
|
||||
time to actually access contiguous locations have been added.
|
||||
(https://github.com/ispc/ispc/issues/79)
|
||||
|
||||
Finally, warnings are now issued (if possible) when it can be determined
|
||||
at compile-time that an out-of-bounds array index is being used.
|
||||
(https://github.com/ispc/ispc/issues/98).
|
||||
|
||||
|
||||
=== v1.0.7 === (3 September 2011)
|
||||
|
||||
The various atomic_*_global() standard library functions are generally
|
||||
|
||||
@@ -1,6 +1,15 @@
|
||||
#!/bin/bash
|
||||
|
||||
rst2html.py ispc.txt > ispc.html
|
||||
for i in ispc perfguide faq; do
|
||||
rst2html.py --template=template.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css $i.rst > $i.html
|
||||
done
|
||||
|
||||
rst2html.py --template=template-news.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css news.rst > news.html
|
||||
|
||||
rst2html.py --template=template-perf.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css perf.rst > perf.html
|
||||
|
||||
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
|
||||
#pdflatex ispc.tex
|
||||
|
||||
825
docs/faq.rst
Normal file
825
docs/faq.rst
Normal file
@@ -0,0 +1,825 @@
|
||||
=====================================
|
||||
Frequently Asked Questions About ispc
|
||||
=====================================
|
||||
|
||||
This document includes a number of frequently (and not frequently) asked
|
||||
questions about ispc, the Intel® SPMD Program Compiler. The source to this
|
||||
document is in the file ``docs/faq.rst`` in the ``ispc`` source
|
||||
distribution.
|
||||
|
||||
* Understanding ispc's Output
|
||||
|
||||
+ `How can I see the assembly language generated by ispc?`_
|
||||
+ `How can I have the assembly output be printed using Intel assembly syntax?`_
|
||||
+ `Why are there multiple versions of exported ispc functions in the assembly output?`_
|
||||
+ `How can I more easily see gathers and scatters in generated assembly?`_
|
||||
|
||||
* Language Details
|
||||
|
||||
+ `What is the difference between "int *foo" and "int foo[]"?`_
|
||||
+ `Why are pointed-to types "uniform" by default?`_
|
||||
+ `What am I getting an error about assigning a varying lvalue to a reference type?`_
|
||||
|
||||
* Interoperability
|
||||
|
||||
+ `How can I supply an initial execution mask in the call from the application?`_
|
||||
+ `How can I generate a single binary executable with support for multiple instruction sets?`_
|
||||
+ `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
|
||||
+ `Is it possible to inline ispc functions in C/C++ code?`_
|
||||
+ `Why is it illegal to pass "varying" values from C/C++ to ispc functions?`_
|
||||
|
||||
* Programming Techniques
|
||||
|
||||
+ `What primitives are there for communicating between SPMD program instances?`_
|
||||
+ `How can a gang of program instances generate variable amounts of output efficiently?`_
|
||||
+ `Is it possible to use ispc for explicit vector programming?`_
|
||||
+ `How can I debug my ispc programs using Valgrind?`_
|
||||
+ `foreach statements generate more complex assembly than I'd expect; what's going on?`_
|
||||
+ `How do I launch an individual task for each active program instance?`_
|
||||
|
||||
Understanding ispc's Output
|
||||
===========================
|
||||
|
||||
How can I see the assembly language generated by ispc?
|
||||
------------------------------------------------------
|
||||
|
||||
The ``--emit-asm`` flag causes assembly output to be generated. If the
|
||||
``-o`` command-line flag is also supplied, the assembly is stored in the
|
||||
given file, or printed to standard output if ``-`` is specified for the
|
||||
filename. For example, given the simple ``ispc`` program:
|
||||
|
||||
::
|
||||
|
||||
export uniform int foo(uniform int a, uniform int b) {
|
||||
return a+b;
|
||||
}
|
||||
|
||||
If the SSE4 target is used, then the following assembly is printed:
|
||||
|
||||
::
|
||||
|
||||
_foo:
|
||||
addl %esi, %edi
|
||||
movl %edi, %eax
|
||||
ret
|
||||
|
||||
|
||||
How can I have the assembly output be printed using Intel assembly syntax?
|
||||
--------------------------------------------------------------------------
|
||||
|
||||
The ``ispc`` compiler is currently only able to emit assembly with AT&T
|
||||
syntax, where the destination operand is the last operand after an
|
||||
instruction. If you'd prefer Intel assembly output, one option is to use
|
||||
Agner Fog's ``objconv`` tool: have ``ispc`` emit a native object file and
|
||||
then use ``objconv`` to disassemble it, specifying the assembler syntax
|
||||
that you prefer. ``objconv`` `is available for download here`_.
|
||||
|
||||
.. _is available for download here: http://www.agner.org/optimize/#objconv
|
||||
|
||||
Why are there multiple versions of exported ispc functions in the assembly output?
|
||||
----------------------------------------------------------------------------------
|
||||
|
||||
Two versions of all functions qualified with ``export`` are generated:
|
||||
one of them is to be called by other ``ispc`` functions, and the
|
||||
other is to be called by the application. The application callable
|
||||
function has the original function's name, while the ``ispc``-callable
|
||||
function has a mangled name that encodes the types of the function's
|
||||
parameters.
|
||||
|
||||
The crucial difference between these two functions is that the
|
||||
application-callable function doesn't take a parameter encoding the current
|
||||
execution mask, while ``ispc``-callable functions have a hidden mask
|
||||
parameter. An implication of this difference is that the ``export``
|
||||
function starts with the execution mask "all on". This allows a number of
|
||||
improvements in the generated code, particularly on architectures that
|
||||
don't have support for masked load and store instructions.
|
||||
|
||||
As an example, consider this short function, which loads a vector's worth
|
||||
of values from two arrays in memory, adds them, and writes the result to an
|
||||
output array.
|
||||
|
||||
::
|
||||
|
||||
export void foo(uniform float a[], uniform float b[],
|
||||
uniform float result[]) {
|
||||
float aa = a[programIndex], bb = b[programIndex];
|
||||
result[programIndex] = aa+bb;
|
||||
}
|
||||
|
||||
Here is the assembly code for the application-callable instance of the
|
||||
function.
|
||||
|
||||
::
|
||||
|
||||
_foo:
|
||||
movups (%rsi), %xmm1
|
||||
movups (%rdi), %xmm0
|
||||
addps %xmm1, %xmm0
|
||||
movups %xmm0, (%rdx)
|
||||
ret
|
||||
|
||||
|
||||
And here is the assembly code for the ``ispc``-callable instance of the
|
||||
function.
|
||||
|
||||
::
|
||||
|
||||
"_foo___uptr<Uf>uptr<Uf>uptr<Uf>":
|
||||
movmskps %xmm0, %eax
|
||||
cmpl $15, %eax
|
||||
je LBB0_3
|
||||
testl %eax, %eax
|
||||
jne LBB0_4
|
||||
ret
|
||||
LBB0_3:
|
||||
movups (%rsi), %xmm1
|
||||
movups (%rdi), %xmm0
|
||||
addps %xmm1, %xmm0
|
||||
movups %xmm0, (%rdx)
|
||||
ret
|
||||
LBB0_4:
|
||||
####
|
||||
#### Code elided; handle mixed mask case..
|
||||
####
|
||||
ret
|
||||
|
||||
There are a few things to notice in this code. First, the current program
|
||||
mask is coming in via the ``%xmm0`` register and the initial few
|
||||
instructions in the function essentially check to see if the mask is all on
|
||||
or all off. If the mask is all on, the code at the label LBB0_3 executes;
|
||||
it's the same as the code that was generated for ``_foo`` above. If the
|
||||
mask is all off, then there's nothing to be done, and the function can
|
||||
return immediately.
|
||||
|
||||
In the case of a mixed mask, a substantial amount of code is generated to
|
||||
load from and then store to only the array elements that correspond to
|
||||
program instances where the mask is on. (This code is elided above.) This
|
||||
general pattern of having two code paths for the "all on" and "mixed" mask
|
||||
cases is used in the code generated for almost all but the most simple
|
||||
functions (where the overhead of the test isn't worthwhile.)
|
||||
|
||||
How can I more easily see gathers and scatters in generated assembly?
|
||||
---------------------------------------------------------------------
|
||||
|
||||
Because CPU vector ISAs don't have native gather and scatter instructions,
|
||||
these memory operations are turned into a series of
|
||||
instructions in the code that ``ispc`` generates. In some cases, it can be
|
||||
useful to see where gathers and scatters actually happen in code; there is
|
||||
an otherwise undocumented command-line flag that provides this information.
|
||||
|
||||
Consider this simple program:
|
||||
|
||||
::
|
||||
|
||||
void set(uniform int a[], int value, int index) {
|
||||
a[index] = value;
|
||||
}
|
||||
|
||||
When compiled normally to the SSE4 target, this program generates this
|
||||
extensive code sequence, which makes it more difficult to see what the
|
||||
program is actually doing.
|
||||
|
||||
::
|
||||
|
||||
"_set___uptr<Ui>ii":
|
||||
pmulld LCPI0_0(%rip), %xmm1
|
||||
movmskps %xmm2, %eax
|
||||
testb $1, %al
|
||||
je LBB0_2
|
||||
movd %xmm1, %ecx
|
||||
movd %xmm0, (%rcx,%rdi)
|
||||
LBB0_2:
|
||||
testb $2, %al
|
||||
je LBB0_4
|
||||
pextrd $1, %xmm1, %ecx
|
||||
pextrd $1, %xmm0, (%rcx,%rdi)
|
||||
LBB0_4:
|
||||
testb $4, %al
|
||||
je LBB0_6
|
||||
pextrd $2, %xmm1, %ecx
|
||||
pextrd $2, %xmm0, (%rcx,%rdi)
|
||||
LBB0_6:
|
||||
testb $8, %al
|
||||
je LBB0_8
|
||||
pextrd $3, %xmm1, %eax
|
||||
pextrd $3, %xmm0, (%rax,%rdi)
|
||||
LBB0_8:
|
||||
ret
|
||||
|
||||
If this program is compiled with the
|
||||
``--opt=disable-handle-pseudo-memory-ops`` command-line flag, then the
|
||||
scatter is left as an unresolved function call. The resulting program
|
||||
won't link because of the unresolved symbol, but the assembly output is much
|
||||
easier to understand:
|
||||
|
||||
::
|
||||
|
||||
"_set___uptr<Ui>ii":
|
||||
movaps %xmm0, %xmm3
|
||||
pmulld LCPI0_0(%rip), %xmm1
|
||||
movdqa %xmm1, %xmm0
|
||||
movaps %xmm3, %xmm1
|
||||
jmp ___pseudo_scatter_base_offsets32_32 ## TAILCALL
|
||||
|
||||
|
||||
Language Details
|
||||
================
|
||||
|
||||
What is the difference between "int \*foo" and "int foo[]"?
|
||||
-----------------------------------------------------------
|
||||
|
||||
In C and C++, declaring a function to take a parameter ``int *foo`` or
|
||||
``int foo[]`` results in the same type for the parameter. Both are
|
||||
pointers to integers. In ``ispc``, these are different types. The first
|
||||
one is a varying pointer to a uniform integer value in memory, while the
|
||||
second results in a uniform pointer to the start of an array of varying
|
||||
integer values in memory.
|
||||
|
||||
To understand why the first is a varying pointer to a uniform integer,
|
||||
first recall that types without explicit rate qualifiers (``uniform``,
|
||||
``varying``, or ``soa<>``) are ``varying`` by default. Second, recall from
|
||||
the `discussion of pointer types in the ispc User's Guide`_ that pointed-to
|
||||
types without rate qualifiers are ``uniform`` by default. (This second
|
||||
rule is discussed further below, in `Why are pointed-to types "uniform" by
|
||||
default?`_.) The type of ``int *foo`` follows from these.
|
||||
|
||||
.. _discussion of pointer types in the ispc User's Guide: ispc.html#pointer-types
|
||||
|
||||
Conversely, in a function body, ``int foo[10]`` represents a declaration of
|
||||
a 10-element array of varying ``int`` values. In that we'd certainly like
|
||||
to be able to pass such an array to a function that takes an ``int []``
|
||||
parameter, the natural type for an ``int []`` parameter is a uniform
|
||||
pointer to varying integer values.
|
||||
|
||||
In terms of compatibility with C/C++, it's unfortunate that this
|
||||
distinction exists, though any other set of rules seems to introduce more
|
||||
awkwardness than this one. (Though we're interested to hear ideas to
|
||||
improve these rules!).
|
||||
|
||||
Why are pointed-to types "uniform" by default?
|
||||
----------------------------------------------
|
||||
|
||||
In ``ispc``, types without rate qualifiers are "varying" by default, but
|
||||
types pointed to by pointers without rate qualifiers are "uniform" by
|
||||
default. Why this difference?
|
||||
|
||||
::
|
||||
|
||||
int foo; // no rate qualifier, "varying int".
|
||||
uniform int *foo; // pointer type has no rate qualifier, pointed-to does.
|
||||
// "varying pointer to uniform int".
|
||||
int *foo; // neither pointer type nor pointed-to type ("int") have
|
||||
// rate qualifiers. Pointer type is varying by default,
|
||||
// pointed-to is uniform. "varying pointer to uniform int".
|
||||
varying int *foo; // varying pointer to varying int
|
||||
|
||||
The first rule, having types without rate qualifiers be varying by default,
|
||||
is a default that keeps the number of "uniform" or "varying" qualifiers in
|
||||
``ispc`` programs low. Most ``ispc`` programs use mostly "varying"
|
||||
variables, so this rule allows most variables to be declared without also
|
||||
requiring rate qualifiers.
|
||||
|
||||
On a related note, this rule allows many C/C++ functions to be used to
|
||||
define equivalent functions in the SPMD execution model that ``ispc``
|
||||
provides with little or no modification:
|
||||
|
||||
::
|
||||
|
||||
// scalar add in C/C++, SPMD/vector add in ispc
|
||||
int add(int a, int b) { return a + b; }
|
||||
|
||||
This motivation also explains why ``uniform int *foo`` represents a varying
|
||||
pointer; having pointers be varying by default if they don't have rate
|
||||
qualifiers similarly helps with porting code from C/C++ to ``ispc``.
|
||||
|
||||
The trickier issue is why pointed-to types are "uniform" by default. In our
|
||||
experience, data in memory that is accessed via pointers is most often
|
||||
uniform; this generally includes all data that has been allocated and
|
||||
initialized by the C/C++ application code. In practice, "varying" types are
|
||||
more generally (but not exclusively) used for local data in ``ispc``
|
||||
functions. Thus, making the pointed-to type uniform by default leads to
|
||||
more concise code for the most common cases.
|
||||
|
||||
|
||||
Why am I getting an error about assigning a varying lvalue to a reference type?
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Given code like the following:
|
||||
|
||||
::
|
||||
|
||||
uniform float a[...];
|
||||
int index = ...;
|
||||
float &r = a[index];
|
||||
|
||||
``ispc`` issues the error "Initializer for reference-type variable "r" must
|
||||
have a uniform lvalue type.". The underlying issue stems from how
|
||||
references are represented in the code generated by ``ispc``. Recall that
|
||||
``ispc`` supports both uniform and varying pointer types--a uniform pointer
|
||||
points to the same location in memory for all program instances in the
|
||||
gang, while a varying pointer allows each program instance to have its own
|
||||
pointer value.
|
||||
|
||||
References are represented as a pointer in the code generated by ``ispc``,
|
||||
though this is generally opaque to the user; in ``ispc``, they are
|
||||
specifically uniform pointers. This design decision was made so that given
|
||||
code like this:
|
||||
|
||||
::
|
||||
|
||||
extern void func(float &val);
|
||||
float foo = ...;
|
||||
func(foo);
|
||||
|
||||
Then the reference would be handled efficiently as a single pointer, rather
|
||||
than unnecessarily being turned into a gang-size of pointers.
|
||||
|
||||
However, an implication of this decision is that it's not possible for
|
||||
references to refer to completely different things for each of the program
|
||||
instances. (And hence the error that is issued). In cases where a unique
|
||||
per-program-instance pointer is needed, a varying pointer should be used
|
||||
instead of a reference.
|
||||
|
||||
|
||||
Interoperability
|
||||
================
|
||||
|
||||
How can I supply an initial execution mask in the call from the application?
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
Recall that when execution transitions from the application code to an
|
||||
``ispc`` function, all of the program instances are initially executing.
|
||||
In some cases, it may be desired that only some of them are running, based on
|
||||
a data-dependent condition computed in the application program. This
|
||||
situation can easily be handled via an additional parameter from the
|
||||
application.
|
||||
|
||||
As a simple example, consider a case where the application code has an
|
||||
array of ``float`` values and we'd like the ``ispc`` code to update
|
||||
just specific values in that array, where the set of values to be
|
||||
updated has been determined by the application. In C++ code, we might
|
||||
have:
|
||||
|
||||
::
|
||||
|
||||
int count = ...;
|
||||
float *array = new float[count];
|
||||
bool *shouldUpdate = new bool[count];
|
||||
// initialize array and shouldUpdate
|
||||
ispc_func(array, shouldUpdate, count);
|
||||
|
||||
Then, the ``ispc`` code could process this update as:
|
||||
|
||||
::
|
||||
|
||||
export void ispc_func(uniform float array[], uniform bool update[],
|
||||
uniform int count) {
|
||||
foreach (i = 0 ... count) {
|
||||
cif (update[i] == true)
|
||||
// update array[i]...
|
||||
}
|
||||
}
|
||||
|
||||
(In this case a "coherent" if statement is likely to be worthwhile if the
|
||||
``update`` array will tend to have sections that are either all-true or
|
||||
all-false.)
|
||||
|
||||
How can I generate a single binary executable with support for multiple instruction sets?
|
||||
-----------------------------------------------------------------------------------------
|
||||
|
||||
``ispc`` can also generate output that supports multiple target instruction
|
||||
sets, also generating code that chooses the most appropriate one at runtime
|
||||
if multiple targets are specified with the ``--target`` command-line
|
||||
argument.
|
||||
|
||||
For example, if you run the command:
|
||||
|
||||
::
|
||||
|
||||
ispc foo.ispc -o foo.o --target=sse2,sse4-x2,avx-x2
|
||||
|
||||
Then four object files will be generated: ``foo_sse2.o``, ``foo_sse4.o``,
|
||||
``foo_avx.o``, and ``foo.o``.[#]_ Link all of these into your executable, and
|
||||
when you call a function in ``foo.ispc`` from your application code,
|
||||
``ispc`` will determine which instruction sets are supported by the CPU the
|
||||
code is running on and will call the most appropriate version of the
|
||||
function available.
|
||||
|
||||
.. [#] Similarly, if you choose to generate assembly language output or
|
||||
LLVM bitcode output, multiple versions of those files will be created.
|
||||
|
||||
In general, the version of the function that runs will be the one in the
|
||||
most capable instruction set that is supported by the system. If you only
|
||||
compile SSE2 and SSE4 variants and run on a system that supports AVX, for
|
||||
example, then the SSE4 variant will be executed. If the system
|
||||
is not able to run any of the available variants of the function (for
|
||||
example, trying to run a function that only has SSE4 and AVX variants on a
|
||||
system that only supports SSE2), then the standard library ``abort()``
|
||||
function will be called.
|
||||
|
||||
One subtlety is that all non-static global variables (if any) must have the
|
||||
same size and layout with all of the targets used. For example, if you
|
||||
have the global variables:
|
||||
|
||||
::
|
||||
|
||||
uniform int foo[2*programCount];
|
||||
int bar;
|
||||
|
||||
and compile to both SSE2 and AVX targets, both of these variables will have
|
||||
different sizes (the first due to program count having the value 4 for SSE2
|
||||
and 8 for AVX, and the second due to ``varying`` types having different
|
||||
numbers of elements with the two targets--essentially the same issue as the
|
||||
first.) ``ispc`` issues an error in this case.
|
||||
|
||||
|
||||
How can I determine at run-time which vector instruction set's instructions were selected to execute?
|
||||
-----------------------------------------------------------------------------------------------------
|
||||
|
||||
``ispc`` doesn't provide any API that allows querying which vector ISA's
|
||||
instructions are running when multi-target compilation was used. However,
|
||||
this can be solved in "user space" by writing a small helper function.
|
||||
Specifically, if you implement a function like this
|
||||
|
||||
::
|
||||
|
||||
export uniform int isa() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
return 0;
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
return 1;
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
return 2;
|
||||
#else
|
||||
return -1;
|
||||
#endif
|
||||
}
|
||||
|
||||
And then call it from your application code at runtime, it will return 0,
|
||||
1, or 2, depending on which target's instructions are running.
|
||||
|
||||
The way this works is a little surprising, but it's a useful trick. Of
|
||||
course the preprocessor ``#if`` checks are all compile-time only
|
||||
operations. What's actually happening is that the function is compiled
|
||||
multiple times, once for each target, with the appropriate ``ISPC_TARGET``
|
||||
preprocessor symbol set. Then, a small dispatch function is generated for
|
||||
the application to actually call. This dispatch function in turn calls the
|
||||
appropriate version of the function based on the CPU of the system it's
|
||||
executing on, which in turn returns the appropriate value.
|
||||
|
||||
In a similar fashion, it's possible to find out at run-time the value of
|
||||
``programCount`` for the target that's actually being used.
|
||||
|
||||
::
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
Is it possible to inline ispc functions in C/C++ code?
|
||||
------------------------------------------------------
|
||||
|
||||
If you're willing to use the ``clang`` C/C++ compiler that's part of the
|
||||
LLVM tool suite, then it is possible to inline ``ispc`` code with C/C++
|
||||
(and conversely, to inline C/C++ calls in ``ispc``). Doing so can provide
|
||||
performance advantages when calling out to short functions written in the
|
||||
"other" language. Note that you don't need to use ``clang`` to compile all
|
||||
of your C/C++ code, but only for the files where you want to be able to
|
||||
inline. In order to do this, you must have a full installation of LLVM
|
||||
version 3.0 or later, including the ``clang`` compiler.
|
||||
|
||||
The basic approach is to have the various compilers emit LLVM intermediate
|
||||
representation (IR) code and to then use tools from LLVM to link together
|
||||
the IR from the compilers and then re-optimize it, which gives the LLVM
|
||||
optimizer the opportunity to do additional inlining and cross-function
|
||||
optimizations. If you have source files ``foo.ispc`` and ``foo.cpp``,
|
||||
first emit LLVM IR:
|
||||
|
||||
::
|
||||
|
||||
ispc --emit-llvm -o foo_ispc.bc foo.ispc
|
||||
clang -O2 -c -emit-llvm -o foo_cpp.bc foo.cpp
|
||||
|
||||
Next, link the two IR files into a single file and run the LLVM optimizer
|
||||
on the result:
|
||||
|
||||
::
|
||||
|
||||
llvm-link foo_ispc.bc foo_cpp.bc -o - | opt -O3 -o foo_opt.bc
|
||||
|
||||
And finally, generate a native object file:
|
||||
|
||||
::
|
||||
|
||||
llc -filetype=obj foo_opt.bc -o foo.o
|
||||
|
||||
This file can in turn be linked in with the rest of your object files when
|
||||
linking your application.
|
||||
|
||||
(Note that if you're using the AVX instruction set, you must provide the
|
||||
``-mattr=+avx`` flag to ``llc``.)
|
||||
|
||||
|
||||
Why is it illegal to pass "varying" values from C/C++ to ispc functions?
|
||||
------------------------------------------------------------------------
|
||||
|
||||
If any of the types in the parameter list to an exported function is
|
||||
"varying" (including recursively, and members of structure types, etc.),
|
||||
then ``ispc`` will issue an error and refuse to compile the function:
|
||||
|
||||
::
|
||||
|
||||
% echo "export int add(int x) { return ++x; }" | ispc
|
||||
<stdin>:1:12: Error: Illegal to return a "varying" type from exported function "add"
|
||||
<stdin>:1:20: Error: Varying parameter "x" is illegal in an exported function.
|
||||
|
||||
While there's no fundamental reason why this isn't possible, recall the
|
||||
definition of "varying" variables: they have one value for each program
|
||||
instance in the gang. As such, the number of values and amount of storage
|
||||
required to represent a varying variable depends on the gang size
|
||||
(i.e. ``programCount``), which can have different values depending on the
|
||||
compilation target.
|
||||
|
||||
``ispc`` therefore prohibits passing "varying" values between the
|
||||
application and the ``ispc`` program in order to prevent the
|
||||
application-side code from depending on a particular gang size, in order to
|
||||
encourage portability to different gang sizes. (A generally desirable
|
||||
programming practice.)
|
||||
|
||||
For cases where the size of data is actually fixed from the application
|
||||
side, the value can be passed via a pointer to a short ``uniform`` array,
|
||||
as follows:
|
||||
|
||||
::
|
||||
|
||||
export void add4(uniform int ptr[4]) {
|
||||
foreach (i = 0 ... 4)
|
||||
ptr[i]++;
|
||||
}
|
||||
|
||||
On the 4-wide SSE instruction set, this compiles to a single vector add
|
||||
instruction (and associated move instructions), while it still also
|
||||
efficiently computes the correct result on 8-wide AVX targets.
|
||||
|
||||
|
||||
Programming Techniques
|
||||
======================
|
||||
|
||||
What primitives are there for communicating between SPMD program instances?
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
The ``broadcast()``, ``rotate()``, and ``shuffle()`` standard library
|
||||
routines provide a variety of mechanisms for the running program instances
|
||||
to communicate values to each other during execution. Note that there's no
|
||||
need to synchronize the program instances before communicating between
|
||||
them, due to the synchronized execution model of gangs of program instances
|
||||
in ``ispc``.
|
||||
|
||||
How can a gang of program instances generate variable amounts of output efficiently?
|
||||
------------------------------------------------------------------------------------
|
||||
|
||||
It's not unusual to have a gang of program instances where each program
|
||||
instance generates a variable amount of output (perhaps some generate no
|
||||
output, some generate one output value, some generate many output values
|
||||
and so forth), and where one would like to have the output densely packed
|
||||
in an output array. The ``exclusive_scan_add()`` function from the
|
||||
standard library is quite useful in this situation.
|
||||
|
||||
Consider the following function:
|
||||
|
||||
::
|
||||
|
||||
uniform int func(uniform float outArray[], ...) {
|
||||
int numOut = ...; // figure out how many to be output
|
||||
float outLocal[MAX_OUT]; // staging area
|
||||
|
||||
// each program instance in the gang puts its results in
|
||||
// outLocal[0], ..., outLocal[numOut-1]
|
||||
|
||||
int startOffset = exclusive_scan_add(numOut);
|
||||
for (int i = 0; i < numOut; ++i)
|
||||
outArray[startOffset + i] = outLocal[i];
|
||||
return reduce_add(numOut);
|
||||
}
|
||||
|
||||
Here, each program instance has computed a number, ``numOut``, of values to
|
||||
output, and has stored them in the ``outLocal`` array. Assume that four
|
||||
program instances are running and that the first one wants to output one
|
||||
value, the second two values, and the third and fourth three values each.
|
||||
In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6)
|
||||
to the four program instances, respectively.
|
||||
|
||||
The first program instance will then write its one result to
|
||||
``outArray[0]``, the second will write its two values to ``outArray[1]``
|
||||
and ``outArray[2]``, and so forth. The ``reduce_add()`` call at the end
|
||||
returns the total number of values that all of the program instances have
|
||||
written to the array.
|
||||
|
||||
FIXME: add discussion of foreach_active as an option here once that's in
|
||||
|
||||
Is it possible to use ispc for explicit vector programming?
|
||||
-----------------------------------------------------------
|
||||
|
||||
The typical model for programming in ``ispc`` is an *implicit* parallel
|
||||
model, where one writes a program that is apparently doing scalar
|
||||
computation on values and the program is then vectorized to run in parallel
|
||||
across the SIMD lanes of a processor. However, ``ispc`` also has some
|
||||
support for explicit vector unit programming, where the vectorization is
|
||||
explicit. Some computations may be more effectively described in the
|
||||
explicit model rather than the implicit model.
|
||||
|
||||
This support is provided via ``uniform`` instances of short vectors.
|
||||
Specifically, if this short program
|
||||
|
||||
::
|
||||
|
||||
export uniform float<8> madd(uniform float<8> a, uniform float<8> b,
|
||||
uniform float<8> c) {
|
||||
return a + b * c;
|
||||
}
|
||||
|
||||
is compiled with the AVX target, ``ispc`` generates the following assembly:
|
||||
|
||||
::
|
||||
|
||||
_madd:
|
||||
vmulps %ymm2, %ymm1, %ymm1
|
||||
vaddps %ymm0, %ymm1, %ymm0
|
||||
ret
|
||||
|
||||
(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
|
||||
``addps`` instructions are generated, and so forth.)
|
||||
|
||||
Note that ``ispc`` doesn't currently support control-flow based on
|
||||
``uniform`` short vector types; it is thus not possible to write code like:
|
||||
|
||||
::
|
||||
|
||||
export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
|
||||
uniform int<8> sum = 0;
|
||||
while (a++ < b)
|
||||
++sum;
|
||||
}
|
||||
|
||||
|
||||
How can I debug my ispc programs using Valgrind?
|
||||
------------------------------------------------
|
||||
|
||||
The `valgrind`_ tool is an extremely useful memory checker for
|
||||
Linux and OSX; it detects a range of memory errors, including accessing
|
||||
memory after it has been freed, accessing memory beyond the end of an
|
||||
array, accessing uninitialized stack variables, and so forth.
|
||||
In general, applications that use ``ispc`` code run with ``valgrind``
|
||||
without modification and ``valgrind`` will detect the same range of memory
|
||||
errors in ``ispc`` code that it does in C/C++ code.
|
||||
|
||||
.. _valgrind: http://valgrind.org
|
||||
|
||||
One issue to be aware of is that until recently, ``valgrind`` only
|
||||
supported the SSE2 vector instructions; if you are using a version of
|
||||
``valgrind`` older than the 3.7.0 release (5 November 2011), you should
|
||||
compile your ``ispc`` programs with ``--target=sse2`` before running them
|
||||
through ``valgrind``. (Note that if no target is specified, then ``ispc``
|
||||
chooses a target based on the capabilities of the system you're running
|
||||
``ispc`` on.) If you run an ``ispc`` program that uses instructions that
|
||||
``valgrind`` doesn't support, you'll see an error message like:
|
||||
|
||||
::
|
||||
|
||||
vex amd64->IR: unhandled instruction bytes: 0xC5 0xFA 0x10 0x0 0xC5 0xFA 0x11 0x84
|
||||
==46059== valgrind: Unrecognised instruction at address 0x100002707.
|
||||
|
||||
The just-released valgrind 3.7.0 adds support for the SSE4.2 instruction
|
||||
set; if you're using that version (and your system supports SSE4.2), then
|
||||
you can use ``--target=sse4`` when compiling to run with ``valgrind``.
|
||||
|
||||
Note that ``valgrind`` does not yet support programs that use the AVX
|
||||
instruction set.
|
||||
|
||||
foreach statements generate more complex assembly than I'd expect; what's going on?
|
||||
-----------------------------------------------------------------------------------
|
||||
|
||||
Given a simple ``foreach`` loop like the following:
|
||||
|
||||
::
|
||||
|
||||
void foo(uniform float a[], uniform int count) {
|
||||
foreach (i = 0 ... count)
|
||||
a[i] *= 2;
|
||||
}
|
||||
|
||||
|
||||
the ``ispc`` compiler generates approximately 40 instructions--why isn't
|
||||
the generated code simpler?
|
||||
|
||||
There are two main components to the code: one handles
|
||||
``programCount``-sized chunks of elements of the array, and the other
|
||||
handles any excess elements at the end of the array that don't completely
|
||||
fill a gang. The code for the main loop is essentially what one would
|
||||
expect: a vector of values is loaded from the array, the multiply is done,
|
||||
and the result is stored.
|
||||
|
||||
::
|
||||
|
||||
LBB0_2: ## %foreach_full_body
|
||||
movslq %edx, %rdx
|
||||
vmovups (%rdi,%rdx), %ymm1
|
||||
vmulps %ymm0, %ymm1, %ymm1
|
||||
vmovups %ymm1, (%rdi,%rdx)
|
||||
addl $32, %edx
|
||||
addl $8, %eax
|
||||
cmpl %ecx, %eax
|
||||
jl LBB0_2
|
||||
|
||||
|
||||
Then, there is a sequence of instructions that handles any additional
|
||||
elements at the end of the array. (These instructions don't execute if
|
||||
there aren't any left-over values to process, but they do lengthen the
|
||||
amount of generated code.)
|
||||
|
||||
::
|
||||
|
||||
## BB#4: ## %partial_inner_only
|
||||
vmovd %eax, %xmm0
|
||||
vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
||||
vpermilps $0, %ymm0, %ymm0 ## ymm0 = ymm0[0,0,0,0,4,4,4,4]
|
||||
vextractf128 $1, %ymm0, %xmm3
|
||||
vmovd %esi, %xmm2
|
||||
vmovaps LCPI0_1(%rip), %ymm1
|
||||
vextractf128 $1, %ymm1, %xmm4
|
||||
vpaddd %xmm4, %xmm3, %xmm3
|
||||
# ....
|
||||
vmulps LCPI0_0(%rip), %ymm1, %ymm1
|
||||
vmaskmovps %ymm1, %ymm0, (%rdi,%rax)
|
||||
|
||||
|
||||
If you know that the number of elements to be processed will always be an
|
||||
exact multiple of 8, 16, etc., then adding a simple assignment to
|
||||
``count`` like the one below gives the compiler enough information to be
|
||||
able to eliminate the code for the additional array elements.
|
||||
|
||||
::
|
||||
|
||||
void foo(uniform float a[], uniform int count) {
|
||||
// This assignment doesn't change the value of count
|
||||
// if it's a multiple of 16, but it gives the compiler
|
||||
// insight into this fact, allowing for simpler code to
|
||||
// be generated for the foreach loop.
|
||||
count = (count & ~(16-1));
|
||||
foreach (i = 0 ... count)
|
||||
a[i] *= 2;
|
||||
}
|
||||
|
||||
With this new version of ``foo()``, only the code for the first loop above
|
||||
is generated.
|
||||
|
||||
|
||||
How do I launch an individual task for each active program instance?
|
||||
--------------------------------------------------------------------
|
||||
|
||||
Recall from the `discussion of "launch" in the ispc User's Guide`_ that a
|
||||
``launch`` statement launches a single task corresponding to a single gang
|
||||
of executing program instances, where the indices of the active program
|
||||
instances are the same as were active when the ``launch`` statement
|
||||
executed.
|
||||
|
||||
.. _discussion of "launch" in the ispc User's Guide: ispc.html#task-parallelism-launch-and-sync-statements
|
||||
|
||||
In some situations, it's desirable to be able to launch an individual task
|
||||
for each executing program instance. For example, we might be performing
|
||||
an iterative computation where a subset of the program instances determine
|
||||
that an item they are responsible for requires additional processing.
|
||||
|
||||
::
|
||||
|
||||
bool itemNeedsMoreProcessing(int);
|
||||
int itemNum = ...;
|
||||
if (itemNeedsMoreProcessing(itemNum)) {
|
||||
// do additional work
|
||||
}
|
||||
|
||||
For performance reasons, it may be desirable to apply an entire gang's
|
||||
worth of computation to each item that needs additional processing;
|
||||
there may be available parallelism in this computation such that we'd like
|
||||
to process each of the items with SPMD computation.
|
||||
|
||||
In this case, the ``foreach_active`` and ``unmasked`` constructs can be
|
||||
applied together to accomplish this goal.
|
||||
|
||||
::
|
||||
|
||||
// do additional work
|
||||
task void doWork(uniform int index);
|
||||
foreach_active (index) {
|
||||
unmasked {
|
||||
launch doWork(extract(itemNum, index));
|
||||
}
|
||||
}
|
||||
|
||||
Recall that the body of the ``foreach_active`` loop runs once for each
|
||||
active program instance, with each active program instance's
|
||||
``programIndex`` value available in ``index`` in the above. In the loop,
|
||||
we can re-establish an "all on" execution mask, enabling execution in all
|
||||
of the program instances in the gang, such that execution in ``doWork()``
|
||||
starts with all instances running. (Alternatively, the ``unmasked`` block
|
||||
could be in the definition of ``doWork()``.)
|
||||
|
||||
4674
docs/ispc.rst
Normal file
4674
docs/ispc.rst
Normal file
File diff suppressed because it is too large
Load Diff
2939
docs/ispc.txt
2939
docs/ispc.txt
File diff suppressed because it is too large
Load Diff
71
docs/news.rst
Normal file
71
docs/news.rst
Normal file
@@ -0,0 +1,71 @@
|
||||
=========
|
||||
ispc News
|
||||
=========
|
||||
|
||||
ispc 1.3.0 is Released
|
||||
----------------------
|
||||
|
||||
A major new version of ``ispc`` has been released. In addition to a number
|
||||
of new language features, this release notably features initial support for
|
||||
compiling to the Intel Xeon Phi (Many Integrated Core) architecture.
|
||||
|
||||
ispc 1.2.1 is Released
|
||||
----------------------
|
||||
|
||||
This is a bugfix release, fixing approximately 20 bugs in the system and
|
||||
improving error handling and error reporting. New functionality includes
|
||||
very efficient float/half conversion routines thanks to Fabian
|
||||
Giesen. See the `1.2.1 release notes`_ for details.
|
||||
|
||||
.. _1.2.1 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
|
||||
|
||||
ispc 1.2.0 is Released
|
||||
-----------------------
|
||||
|
||||
A new major release was posted on March 20, 2012. This release includes
|
||||
significant new functionality for cleanly handling "structure of arrays"
|
||||
(SoA) data layout and a new model for how uniform and varying are handled
|
||||
with structure types.
|
||||
|
||||
Paper on ispc To Appear in InPar 2012
|
||||
-------------------------------------
|
||||
|
||||
A technical paper on ``ispc``, `ispc: A SPMD Compiler for High-Performance
|
||||
CPU Programming`_, by Matt Pharr and William R. Mark, has been accepted to
|
||||
the `InPar 2012`_ conference. This paper describes a number of the design
|
||||
features and key characteristics of the ``ispc`` implementation.
|
||||
|
||||
(© 2012 IEEE. Personal use of this material is permitted. Permission from
|
||||
IEEE must be obtained for all other uses, in any current or future media,
|
||||
including reprinting/republishing this material for advertising or
|
||||
promotional purposes, creating new collective works, for resale or
|
||||
redistribution to servers or lists, or reuse of any copyrighted component
|
||||
of this work in other works.).
|
||||
|
||||
.. _ispc\: A SPMD Compiler for High-Performance CPU Programming: https://github.com/downloads/ispc/ispc/ispc_inpar_2012.pdf
|
||||
.. _InPar 2012: http://innovativeparallel.org/
|
||||
|
||||
ispc 1.1.4 is Released
|
||||
----------------------
|
||||
|
||||
On February 4, 2012, the 1.1.4 release of ``ispc`` was posted; new features
|
||||
include ``new`` and ``delete`` for dynamic memory allocation in ``ispc``
|
||||
programs, "local" atomic operations in the standard library, and a new
|
||||
scalar compilation target. See the `1.1.4 release notes`_ for details.
|
||||
|
||||
.. _1.1.4 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
|
||||
|
||||
|
||||
ispc 1.1.3 is Released
|
||||
----------------------
|
||||
|
||||
With this release, the language now supports "switch" statements, with the same semantics and syntax as in C.
|
||||
|
||||
This release includes fixes for two important performance related issues:
|
||||
the quality of code generated for "foreach" statements has been
|
||||
substantially improved, and a performance regression with code for "gathers"
|
||||
that was introduced in v1.1.2 has been fixed in this release.
|
||||
|
||||
Thanks to Jean-Luc Duprat for a number of patches that improve support for
|
||||
building on various platforms, and to Pierre-Antoine Lacaze for patches so
|
||||
that ispc builds under MinGW.
|
||||
85
docs/perf.rst
Normal file
85
docs/perf.rst
Normal file
@@ -0,0 +1,85 @@
|
||||
===========
|
||||
Performance
|
||||
===========
|
||||
|
||||
The SPMD programming model that ``ispc`` provides makes it easy to harness the
|
||||
computational power available in SIMD vector units on modern CPUs, while
|
||||
its basis in C makes it easy for programmers to adopt and use
|
||||
productively. This page summarizes the performance of ``ispc`` with the
|
||||
workloads in the ``examples/`` directory of the ``ispc`` distribution.
|
||||
|
||||
These results were measured on an Apple iMac with a 4-core 3.4GHz
|
||||
Intel® Core-i7 processor using the Intel® AVX instruction set. The basis
|
||||
for comparison is a reference C++ implementation compiled with gcc 4.2.1,
|
||||
the version distributed with OS X 10.7.2. (The reference implementation is
|
||||
also included in the ``examples/`` directory.)
|
||||
|
||||
.. list-table:: Performance of ``ispc`` with a variety of the workloads
|
||||
from the ``examples/`` directory of the ``ispc`` distribution, compared
|
||||
to a reference C++ implementation compiled with gcc 4.2.1.
|
||||
|
||||
* - Workload
|
||||
- ``ispc``, 1 core
|
||||
- ``ispc``, 4 cores
|
||||
* - `AOBench`_ (512 x 512 resolution)
|
||||
- 6.19x
|
||||
- 28.06x
|
||||
* - `Binomial Options`_ (128k options)
|
||||
- 7.94x
|
||||
- 33.43x
|
||||
* - `Black-Scholes Options`_ (128k options)
|
||||
- 8.45x
|
||||
- 32.48x
|
||||
* - `Deferred Shading`_ (1280p)
|
||||
- 5.02x
|
||||
- 23.06x
|
||||
* - `Mandelbrot Set`_
|
||||
- 6.21x
|
||||
- 20.28x
|
||||
* - `Perlin Noise Function`_
|
||||
- 5.37x
|
||||
- n/a
|
||||
* - `Ray Tracer`_ (Sponza dataset)
|
||||
- 4.31x
|
||||
- 20.29x
|
||||
* - `3D Stencil`_
|
||||
- 4.05x
|
||||
- 15.53x
|
||||
* - `Volume Rendering`_
|
||||
- 3.60x
|
||||
- 17.53x
|
||||
|
||||
|
||||
.. _AOBench: https://github.com/ispc/ispc/tree/master/examples/aobench
|
||||
.. _Binomial Options: https://github.com/ispc/ispc/tree/master/examples/options
|
||||
.. _Black-Scholes Options: https://github.com/ispc/ispc/tree/master/examples/options
|
||||
.. _Deferred Shading: https://github.com/ispc/ispc/tree/master/examples/deferred
|
||||
.. _Mandelbrot Set: https://github.com/ispc/ispc/tree/master/examples/mandelbrot_tasks
|
||||
.. _Ray Tracer: https://github.com/ispc/ispc/tree/master/examples/rt
|
||||
.. _Perlin Noise Function: https://github.com/ispc/ispc/tree/master/examples/noise
|
||||
.. _3D Stencil: https://github.com/ispc/ispc/tree/master/examples/stencil
|
||||
.. _Volume Rendering: https://github.com/ispc/ispc/tree/master/examples/volume_rendering
|
||||
|
||||
|
||||
The following table shows speedups for a number of the examples on a
|
||||
2.40GHz, 40-core Intel® Xeon E7-8870 system with the Intel® SSE4
|
||||
instruction set, running Microsoft Windows Server 2008 Enterprise. Here,
|
||||
the serial C/C++ baseline code was compiled with MSVC 2010.
|
||||
|
||||
.. list-table:: Performance of ``ispc`` with a variety of the workloads
|
||||
from the ``examples/`` directory of the ``ispc`` distribution, on
|
||||
a system with 40 CPU cores.
|
||||
|
||||
* - Workload
|
||||
- ``ispc``, 40 cores
|
||||
* - AOBench (2048 x 2048 resolution)
|
||||
- 182.36x
|
||||
* - Binomial Options (2m options)
|
||||
- 63.85x
|
||||
* - Black-Scholes Options (2m options)
|
||||
- 83.97x
|
||||
* - Ray Tracer (Sponza dataset)
|
||||
- 195.67x
|
||||
* - Volume Rendering
|
||||
- 243.18x
|
||||
|
||||
829
docs/perfguide.rst
Normal file
829
docs/perfguide.rst
Normal file
@@ -0,0 +1,829 @@
|
||||
==============================================
|
||||
Intel® SPMD Program Compiler Performance Guide
|
||||
==============================================
|
||||
|
||||
The SPMD programming model provided by ``ispc`` naturally delivers
|
||||
excellent performance for many workloads thanks to efficient use of CPU
|
||||
SIMD vector hardware. This guide provides more details about how to get
|
||||
the most out of ``ispc`` in practice.
|
||||
|
||||
* `Key Concepts`_
|
||||
|
||||
+ `Efficient Iteration With "foreach"`_
|
||||
+ `Improving Control Flow Coherence With "foreach_tiled"`_
|
||||
+ `Using Coherent Control Flow Constructs`_
|
||||
+ `Use "uniform" Whenever Appropriate`_
|
||||
+ `Use "Structure of Arrays" Layout When Possible`_
|
||||
|
||||
* `Tips and Techniques`_
|
||||
|
||||
+ `Understanding Gather and Scatter`_
|
||||
+ `Avoid 64-bit Addressing Calculations When Possible`_
|
||||
+ `Avoid Computation With 8 and 16-bit Integer Types`_
|
||||
+ `Implementing Reductions Efficiently`_
|
||||
+ `Using "foreach_active" Effectively`_
|
||||
+ `Using Low-level Vector Tricks`_
|
||||
+ `The "Fast math" Option`_
|
||||
+ `"inline" Aggressively`_
|
||||
+ `Avoid The System Math Library`_
|
||||
+ `Declare Variables In The Scope Where They're Used`_
|
||||
+ `Instrumenting ISPC Programs To Understand Runtime Behavior`_
|
||||
+ `Choosing A Target Vector Width`_
|
||||
|
||||
* `Disclaimer and Legal Information`_
|
||||
|
||||
* `Optimization Notice`_
|
||||
|
||||
Key Concepts
|
||||
============
|
||||
|
||||
This section describes the five most important concepts to understand and
|
||||
keep in mind when writing high-performance ``ispc`` programs. It assumes
|
||||
good familiarity with the topics covered in the ``ispc`` `Users Guide`_.
|
||||
|
||||
.. _Users Guide: ispc.html
|
||||
|
||||
Efficient Iteration With "foreach"
|
||||
----------------------------------
|
||||
|
||||
The ``foreach`` parallel iteration construct is semantically equivalent to
|
||||
a regular ``for()`` loop, though it offers meaningful performance benefits.
|
||||
(See the `documentation on "foreach" in the Users Guide`_ for a review of
|
||||
its syntax and semantics.) As an example, consider this simple function
|
||||
that iterates over some number of elements in an array, doing computation
|
||||
on each one:
|
||||
|
||||
.. _documentation on "foreach" in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled
|
||||
|
||||
::
|
||||
|
||||
export void foo(uniform int a[], uniform int count) {
|
||||
for (int i = programIndex; i < count; i += programCount) {
|
||||
// do some computation on a[i]
|
||||
}
|
||||
}
|
||||
|
||||
Depending on the specifics of the computation being performed, the code
|
||||
generated for this function could likely be improved by modifying the code
|
||||
so that the loop only goes as far through the data as is possible to pack
|
||||
an entire gang of program instances with computation each time through the
|
||||
loop. Doing so enables the ``ispc`` compiler to generate more efficient
|
||||
code for cases where it knows that the execution mask is "all on". Then,
|
||||
an ``if`` statement at the end handles processing the ragged extra bits of
|
||||
data that didn't fully fill a gang.
|
||||
|
||||
::
|
||||
|
||||
export void foo(uniform int a[], uniform int count) {
|
||||
// First, just loop up to the point where all program instances
|
||||
// in the gang will be active at the loop iteration start
|
||||
uniform int countBase = count & ~(programCount-1);
|
||||
for (uniform int i = 0; i < countBase; i += programCount) {
|
||||
int index = i + programIndex;
|
||||
// do some computation on a[index]
|
||||
}
|
||||
// Now handle the ragged extra bits at the end
|
||||
if (countBase < count) {
|
||||
int index = countBase + programIndex;
|
||||
// do some computation on a[index]
|
||||
}
|
||||
}
|
||||
|
||||
While the performance of the above code will likely be better than the
|
||||
first version of the function, the loop body code has been duplicated (or
|
||||
has been forced to move into a separate utility function).
|
||||
|
||||
Using the ``foreach`` looping construct as below provides all of the
|
||||
performance benefits of the second version of this function, with the
|
||||
compactness of the first.
|
||||
|
||||
::
|
||||
|
||||
export void foo(uniform int a[], uniform int count) {
|
||||
foreach (i = 0 ... count) {
|
||||
// do some computation on a[i]
|
||||
}
|
||||
}
|
||||
|
||||
Improving Control Flow Coherence With "foreach_tiled"
|
||||
-----------------------------------------------------
|
||||
|
||||
Depending on the computation being performed, ``foreach_tiled`` may give
|
||||
better performance than ``foreach``. (See the `documentation in the Users
|
||||
Guide`_ for the syntax and semantics of ``foreach_tiled``.) Given a
|
||||
multi-dimensional iteration like:
|
||||
|
||||
.. _documentation in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled
|
||||
|
||||
::
|
||||
|
||||
foreach (i = 0 ... width, j = 0 ... height) {
|
||||
// do computation on element (i,j)
|
||||
}
|
||||
|
||||
if the ``foreach`` statement is used, elements in the gang of program
|
||||
instances will be mapped to values of ``i`` and ``j`` by taking spans of
|
||||
``programCount`` elements across ``i`` with a single value of ``j``. For
|
||||
example, the ``foreach`` statement above roughly corresponds to:
|
||||
|
||||
::
|
||||
|
||||
for (uniform int j = 0; j < height; ++j)
|
||||
for (int i = 0; i < width; i += programCount) {
|
||||
// do computation
|
||||
}
|
||||
|
||||
When a multi-dimensional domain is being iterated over, the ``foreach_tiled``
|
||||
statement maps program instances to data in a way that tries to select
|
||||
square n-dimensional segments of the domain. For example, on a compilation
|
||||
target with 8-wide gangs of program instances, it generates code that
|
||||
iterates over the domain the same way as the following code (though more
|
||||
efficiently):
|
||||
|
||||
::
|
||||
|
||||
for (int j = programIndex/4; j < height; j += 2)
|
||||
for (int i = programIndex%4; i < width; i += 4) {
|
||||
// do computation
|
||||
}
|
||||
|
||||
Thus, each gang of program instances operates on a 2x4 tile of the domain.
|
||||
With higher-dimensional iteration and different gang sizes, a similar
|
||||
mapping is performed--e.g. for 2D iteration with a 16-wide gang size, 4x4
|
||||
tiles are iterated over; for 4D iteration with an 8-wide gang, 1x2x2x2 tiles are
|
||||
processed, and so forth.
|
||||
|
||||
Performance benefit can come from using ``foreach_tiled`` in that it
|
||||
essentially optimizes for the benefit of iterating over *compact* regions
|
||||
of the domain (while ``foreach`` iterates over the domain in a way that
|
||||
generally allows linear memory access.) There are two benefits from
|
||||
processing compact regions of the domain.
|
||||
|
||||
First, it's often the case that the control flow coherence of the program
|
||||
instances in the gang is improved; if data-dependent control flow decisions
|
||||
are related to the values of the data in the domain being processed, and if
|
||||
the data values have some coherence, iterating with compact regions will
|
||||
improve control flow coherence.
|
||||
|
||||
Second, processing compact regions may mean that the data accessed by
|
||||
program instances in the gang is more coherent, leading to performance
|
||||
benefits from better cache hit rates.
|
||||
|
||||
As a concrete example, for the ray tracer example in the ``ispc``
|
||||
distribution (in the ``examples/rt`` directory), performance is 20% better
|
||||
when the pixels are iterated over using ``foreach_tiled`` than ``foreach``,
|
||||
because more coherent regions of the scene are accessed by the set of rays
|
||||
in the gang of program instances.
|
||||
|
||||
|
||||
Using Coherent Control Flow Constructs
|
||||
--------------------------------------
|
||||
|
||||
Recall from the ``ispc`` Users Guide, in the `SPMD-on-SIMD Execution Model
|
||||
section`_ that ``if`` statements with a ``uniform`` test compile to more
|
||||
efficient code than ``if`` statements with varying tests. The coherent ``cif``
|
||||
statement can provide many benefits of ``if`` with a uniform test in the
|
||||
case where the test is actually varying.
|
||||
|
||||
.. _SPMD-on-SIMD Execution Model section: ispc.html#the-spmd-on-simd-execution-model
|
||||
|
||||
In this case, the code the compiler generates for the ``cif``
|
||||
test is along the lines of the following pseudo-code:
|
||||
|
||||
::
|
||||
|
||||
bool expr = /* evaluate cif condition */
|
||||
if (all(expr)) {
|
||||
// run "true" case of if test only
|
||||
} else if (!any(expr)) {
|
||||
// run "false" case of if test only
|
||||
} else {
|
||||
// run both true and false cases, updating mask appropriately
|
||||
}
|
||||
|
||||
For ``if`` statements where the different running SPMD program instances
|
||||
don't have coherent values for the boolean ``if`` test, using ``cif``
|
||||
introduces some additional overhead from the ``all`` and ``any`` tests as
|
||||
well as the corresponding branches. For cases where the program
|
||||
instances often do compute the same boolean value, this overhead is
|
||||
worthwhile. If the control flow is in fact usually incoherent, this
|
||||
overhead only costs performance.
|
||||
|
||||
In a similar fashion, ``ispc`` provides ``cfor``, ``cwhile``, and ``cdo``
|
||||
statements. These statements are semantically the same as the
|
||||
corresponding non-"c"-prefixed statements.
|
||||
|
||||
Use "uniform" Whenever Appropriate
|
||||
----------------------------------
|
||||
|
||||
For any variable that will always have the same value across all of the
|
||||
program instances in a gang, declare the variable with the ``uniform``
|
||||
qualifier. Doing so enables the ``ispc`` compiler to emit better code in
|
||||
many different ways.
|
||||
|
||||
As a simple example, consider a ``for`` loop that always does the same
|
||||
number of iterations:
|
||||
|
||||
::
|
||||
|
||||
for (int i = 0; i < 10; ++i)
|
||||
// do something ten times
|
||||
|
||||
If this is written with ``i`` as a ``varying`` variable, as above, there's
|
||||
additional overhead in the code generated for the loop as the compiler
|
||||
emits instructions to handle the possibility of not all program instances
|
||||
following the same control flow path (as might be the case if the loop
|
||||
limit, 10, was itself a ``varying`` value.)
|
||||
|
||||
If the above loop is instead written with ``i`` ``uniform``, as:
|
||||
|
||||
::
|
||||
|
||||
for (uniform int i = 0; i < 10; ++i)
|
||||
// do something ten times
|
||||
|
||||
Then better code can be generated (and the loop possibly unrolled).
|
||||
|
||||
In some cases, the compiler may be able to detect simple cases like these,
|
||||
but it's always best to provide the compiler with as much help as possible
|
||||
to understand the actual form of your computation.
|
||||
|
||||
|
||||
Use "Structure of Arrays" Layout When Possible
|
||||
----------------------------------------------
|
||||
|
||||
In general, memory access performance (for both reads and writes) is best
|
||||
when the running program instances access a contiguous region of memory; in
|
||||
this case efficient vector load and store instructions can often be used
|
||||
rather than gathers and scatters. As an example of this issue, consider an
|
||||
array of a simple point datatype laid out and accessed in conventional
|
||||
"array of structures" (AOS) layout:
|
||||
|
||||
::
|
||||
|
||||
struct Point { float x, y, z; };
|
||||
uniform Point pts[...];
|
||||
float v = pts[programIndex].x;
|
||||
|
||||
In the above code, the access to ``pts[programIndex].x`` accesses
|
||||
non-sequential memory locations, due to the ``y`` and ``z`` values between
|
||||
the desired ``x`` values in memory. A "gather" is required to get the
|
||||
value of ``v``, with a corresponding decrease in performance.
|
||||
|
||||
If ``Point`` was defined as a "structure of arrays" (SOA) type, the access
|
||||
can be much more efficient:
|
||||
|
||||
::
|
||||
|
||||
struct Point8 { float x[8], y[8], z[8]; };
|
||||
uniform Point8 pts8[...];
|
||||
int majorIndex = programIndex / 8;
|
||||
int minorIndex = programIndex % 8;
|
||||
float v = pts8[majorIndex].x[minorIndex];
|
||||
|
||||
In this case, each ``Point8`` has 8 ``x`` values contiguous in memory
|
||||
before 8 ``y`` values and then 8 ``z`` values. If the gang size is 8 or
|
||||
less, the access for ``v`` will have the same value of ``majorIndex`` for
|
||||
all program instances and will access consecutive elements of the ``x[8]``
|
||||
array with a vector load. (For larger gang sizes, two 8-wide vector loads
|
||||
would be issued, which is also quite efficient.)
|
||||
|
||||
However, the syntax in the above code is messy; accessing SOA data in this
|
||||
fashion is much less elegant than the corresponding code for accessing the
|
||||
data with AOS layout. The ``soa`` qualifier in ``ispc`` can be used to
|
||||
cause the corresponding transformation to be made to the ``Point`` type,
|
||||
while preserving the clean syntax for data access that comes with AOS
|
||||
layout:
|
||||
|
||||
::
|
||||
|
||||
soa<8> Point pts[...];
|
||||
float v = pts[programIndex].x;
|
||||
|
||||
Thanks to having SOA layout as a first-class concept in the language's type
|
||||
system, it's easy to write functions that convert data between the
|
||||
layouts. For example, the ``aos_to_soa`` function below converts ``count``
|
||||
elements of the given ``Point`` type from AOS to 8-wide SOA layout. (It
|
||||
assumes that the caller has pre-allocated sufficient space in the
|
||||
``pts_soa`` output array.)
|
||||
|
||||
::
|
||||
|
||||
void aos_to_soa(uniform Point pts_aos[], uniform int count,
|
||||
soa<8> Point pts_soa[]) {
|
||||
foreach (i = 0 ... count)
|
||||
pts_soa[i] = pts_aos[i];
|
||||
}
|
||||
|
||||
Analogously, a function could be written to convert back from SOA to AOS if
|
||||
needed.
|
||||
|
||||
|
||||
Tips and Techniques
|
||||
===================
|
||||
|
||||
This section introduces a number of additional techniques that are worth
|
||||
keeping in mind when writing ``ispc`` programs.
|
||||
|
||||
Understanding Gather and Scatter
|
||||
--------------------------------
|
||||
|
||||
Memory reads and writes from the program instances in a gang that access
|
||||
irregular memory locations (rather than a consecutive set of locations, or
|
||||
a single location) can be relatively inefficient. As an example, consider
|
||||
the "simple" array indexing calculation below:
|
||||
|
||||
::
|
||||
|
||||
int i = ....;
|
||||
uniform float x[10] = { ... };
|
||||
float f = x[i];
|
||||
|
||||
Since the index ``i`` is a varying value, the program instances in the gang
|
||||
will in general be reading different locations in the array ``x``. Because
|
||||
current CPUs do not have a "gather" instruction, the ``ispc`` compiler has to
|
||||
serialize these memory reads, performing a separate memory load for each
|
||||
running program instance, packing the result into ``f``. (The analogous
|
||||
case happens for a write into ``x[i]``.)
|
||||
|
||||
In many cases, gathers like these are unavoidable; the program instances
|
||||
just need to access incoherent memory locations. However, if the array
|
||||
index ``i`` actually has the same value for all of the program instances or
|
||||
if it represents an access to a consecutive set of array locations, much
|
||||
more efficient load and store instructions can be generated instead of
|
||||
gathers and scatters, respectively.
|
||||
|
||||
In many cases, the ``ispc`` compiler is able to deduce that the memory
|
||||
locations accessed by a varying index are either all the same or are
|
||||
consecutive. For example, given:
|
||||
|
||||
::
|
||||
|
||||
uniform int x = ...;
|
||||
int y = x;
|
||||
return array[y];
|
||||
|
||||
The compiler is able to determine that all of the program instances are
|
||||
loading from the same location, even though ``y`` is not a ``uniform``
|
||||
variable. In this case, the compiler will transform this load to a regular
|
||||
vector load, rather than a general gather.
|
||||
|
||||
Sometimes the running program instances will access a linear sequence of
|
||||
memory locations; this happens most frequently when array indexing is done
|
||||
based on the built-in ``programIndex`` variable. In many of these cases,
|
||||
the compiler is also able to detect this case and then do a vector load.
|
||||
For example, given:
|
||||
|
||||
::
|
||||
|
||||
for (int i = programIndex; i < count; i += programCount)
|
||||
// process array[i];
|
||||
|
||||
Regular vector loads and stores are issued for accesses to ``array[i]``.
|
||||
|
||||
Both of these cases have been ones where the compiler is able to determine
|
||||
statically that the index has the same value at compile-time. It's
|
||||
often the case that this determination can't be made at compile time, but
|
||||
this is often the case at run time. The ``reduce_equal()`` function from
|
||||
the standard library can be used in this case; it checks to see if the
|
||||
given value is the same across all of the running program instances,
|
||||
returning true and its ``uniform`` value if so.
|
||||
|
||||
The following function shows the use of ``reduce_equal()`` to check for an
|
||||
equal index at execution time and then either do a scalar load and
|
||||
broadcast or a general gather.
|
||||
|
||||
::
|
||||
|
||||
uniform float array[..] = { ... };
|
||||
float value;
|
||||
int i = ...;
|
||||
uniform int ui;
|
||||
if (reduce_equal(i, &ui) == true)
|
||||
value = array[ui]; // scalar load + broadcast
|
||||
else
|
||||
value = array[i]; // gather
|
||||
|
||||
For a simple case like the one above, the overhead of doing the
|
||||
``reduce_equal()`` check is likely not worthwhile compared to just always
|
||||
doing a gather. In more complex cases, where a number of accesses are done
|
||||
based on the index, it can be worth doing. See the example
|
||||
``examples/volume_rendering`` in the ``ispc`` distribution for the use of
|
||||
this technique in an instance where it is beneficial to performance.
|
||||
|
||||
Understanding Memory Read Coalescing
|
||||
------------------------------------
|
||||
|
||||
XXXX todo
|
||||
|
||||
|
||||
Avoid 64-bit Addressing Calculations When Possible
|
||||
--------------------------------------------------
|
||||
|
||||
Even when compiling to a 64-bit architecture target, ``ispc`` does many of
|
||||
the addressing calculations in 32-bit precision by default--this behavior
|
||||
can be overridden with the ``--addressing=64`` command-line argument. This
|
||||
option should only be used if it's necessary to be able to address over 4GB
|
||||
of memory in the ``ispc`` code, as it essentially doubles the cost of
|
||||
memory addressing calculations in the generated code.
|
||||
|
||||
Avoid Computation With 8 and 16-bit Integer Types
|
||||
-------------------------------------------------
|
||||
|
||||
The code generated for 8 and 16-bit integer types is generally not as
|
||||
efficient as the code generated for 32-bit integer types. It is generally
|
||||
worthwhile to use 32-bit integer types for intermediate computations, even
|
||||
if the final result will be stored in a smaller integer type.
|
||||
|
||||
Implementing Reductions Efficiently
|
||||
-----------------------------------
|
||||
|
||||
It's often necessary to compute a reduction over a data set--for example,
|
||||
one might want to add all of the values in an array, compute their minimum,
|
||||
etc. ``ispc`` provides a few capabilities that make it easy to efficiently
|
||||
compute reductions like these. However, it's important to use these
|
||||
capabilities appropriately for best results.
|
||||
|
||||
As an example, consider the task of computing the sum of all of the values
|
||||
in an array. In C code, we might have:
|
||||
|
||||
::
|
||||
|
||||
/* C implementation of a sum reduction */
|
||||
float sum(const float array[], int count) {
|
||||
float sum = 0;
|
||||
for (int i = 0; i < count; ++i)
|
||||
sum += array[i];
|
||||
return sum;
|
||||
}
|
||||
|
||||
Exactly this computation could also be expressed as a purely uniform
|
||||
computation in ``ispc``, though without any benefit from vectorization:
|
||||
|
||||
::
|
||||
|
||||
/* inefficient ispc implementation of a sum reduction */
|
||||
uniform float sum(const uniform float array[], uniform int count) {
|
||||
uniform float sum = 0;
|
||||
for (uniform int i = 0; i < count; ++i)
|
||||
sum += array[i];
|
||||
return sum;
|
||||
}
|
||||
|
||||
As a first try, one might try using the ``reduce_add()`` function from the
|
||||
``ispc`` standard library; it takes a ``varying`` value and returns the sum
|
||||
of that value across all of the active program instances.
|
||||
|
||||
::
|
||||
|
||||
/* inefficient ispc implementation of a sum reduction */
|
||||
uniform float sum(const uniform float array[], uniform int count) {
|
||||
uniform float sum = 0;
|
||||
foreach (i = 0 ... count)
|
||||
sum += reduce_add(array[i]);
|
||||
return sum;
|
||||
}
|
||||
|
||||
This implementation loads a gang's worth of values from the array, one for
|
||||
each of the program instances, and then uses ``reduce_add()`` to reduce
|
||||
across the program instances and then update the sum. Unfortunately this
|
||||
approach loses most benefit from vectorization, as it does more work on the
|
||||
cross-program instance ``reduce_add()`` call than it saves from the vector
|
||||
load of values.
|
||||
|
||||
The most efficient approach is to do the reduction in two phases: rather
|
||||
than using a ``uniform`` variable to store the sum, we maintain a varying
|
||||
value, such that each program instance is effectively computing a local
|
||||
partial sum on the subset of array values that it has loaded from the
|
||||
array. When the loop over array elements concludes, a single call to
|
||||
``reduce_add()`` computes the final reduction across each of the program
|
||||
instances' elements of ``sum``. This approach effectively compiles to a
|
||||
single vector load and a single vector add for each loop iteration's worth of
|
||||
values--very efficient code in the end.
|
||||
|
||||
::
|
||||
|
||||
/* good ispc implementation of a sum reduction */
|
||||
uniform float sum(const uniform float array[], uniform int count) {
|
||||
float sum = 0;
|
||||
foreach (i = 0 ... count)
|
||||
sum += array[i];
|
||||
return reduce_add(sum);
|
||||
}
|
||||
|
||||
Using "foreach_active" Effectively
|
||||
----------------------------------
|
||||
|
||||
For high-performance code, it is worth paying attention to how the body of
a ``foreach_active`` loop accesses memory.
|
||||
|
||||
For example, consider this segment of code, from the introduction of
|
||||
``foreach_active`` in the ispc User's Guide:
|
||||
|
||||
::
|
||||
|
||||
uniform float array[...] = { ... };
|
||||
int index = ...;
|
||||
foreach_active (i) {
|
||||
++array[index];
|
||||
}
|
||||
|
||||
Here, ``index`` was assumed to possibly have the same value for multiple
|
||||
program instances, so the updates to ``array[index]`` are serialized by the
|
||||
``foreach_active`` statement in order to not have undefined results when
|
||||
``index`` values do collide.
|
||||
|
||||
The code generated by the compiler can be improved in this case by making
|
||||
it clear that only a single element of the array is accessed by
|
||||
``array[index]`` and that thus a general gather or scatter isn't required.
|
||||
Specifically, by using the ``extract()`` function from the standard library
|
||||
to extract the current program instance's value of ``index`` into a
|
||||
``uniform`` variable and then using that to index into ``array``, as below,
|
||||
more efficient code is generated.
|
||||
|
||||
::
|
||||
|
||||
foreach_active (instanceNum) {
|
||||
uniform int unifIndex = extract(index, instanceNum);
|
||||
++array[unifIndex];
|
||||
}
|
||||
|
||||
|
||||
Using Low-level Vector Tricks
|
||||
-----------------------------
|
||||
|
||||
Many low-level Intel® SSE and AVX coding constructs can be implemented in
|
||||
``ispc`` code. The ``ispc`` standard library functions ``intbits()`` and
|
||||
``floatbits()`` are often useful in this context. Recall that
|
||||
``intbits()`` takes a ``float`` value and returns it as an integer where
|
||||
the bits of the integer are the same as the bit representation in memory of
|
||||
the ``float``. (In other words, it does *not* perform a floating-point to
|
||||
integer conversion.) ``floatbits()``, then, performs the inverse
|
||||
computation.
|
||||
|
||||
As an example of the use of these functions, the following code efficiently
|
||||
reverses the sign of the given values.
|
||||
|
||||
::
|
||||
|
||||
float flipsign(float a) {
|
||||
unsigned int i = intbits(a);
|
||||
i ^= 0x80000000;
|
||||
return floatbits(i);
|
||||
}
|
||||
|
||||
This code compiles down to a single XOR instruction.
|
||||
|
||||
The "Fast math" Option
|
||||
----------------------
|
||||
|
||||
``ispc`` has a ``--opt=fast-math`` command-line flag that enables a number of
|
||||
optimizations that may be undesirable in code where numerical precision is
|
||||
critically important. For many graphics applications, for example, the
|
||||
approximations introduced may be acceptable, however. The following two
|
||||
optimizations are performed when ``--opt=fast-math`` is used. By default, the
|
||||
``--opt=fast-math`` flag is off.
|
||||
|
||||
* Expressions like ``x / y``, where ``y`` is a compile-time constant, are
|
||||
transformed to ``x * (1./y)``, where the inverse value of ``y`` is
|
||||
precomputed at compile time.
|
||||
|
||||
* Expressions like ``x / y``, where ``y`` is not a compile-time constant,
|
||||
are transformed to ``x * rcp(y)``, where ``rcp()`` maps to the
|
||||
approximate reciprocal instruction from the ``ispc`` standard library.
|
||||
|
||||
|
||||
"inline" Aggressively
|
||||
---------------------
|
||||
|
||||
Inlining functions aggressively is generally beneficial for performance
|
||||
with ``ispc``. Definitely use the ``inline`` qualifier for any short
|
||||
functions (a few lines long), and experiment with it for longer functions.
|
||||
|
||||
Avoid The System Math Library
|
||||
-----------------------------
|
||||
|
||||
The default math library for transcendentals and the like that ``ispc`` uses has
|
||||
higher error than the system's math library, though it is much more efficient
|
||||
due to being vectorized across the program instances and due to the fact
|
||||
that the functions can be inlined in the final code. (It generally has
|
||||
errors in the range of 10ulps, while the system math library generally has
|
||||
no more than 1ulp of error for transcendentals.)
|
||||
|
||||
If the ``--math-lib=system`` command-line option is used when compiling an
|
||||
``ispc`` program, then calls to the system math library will be generated
|
||||
instead. This option should only be used if the higher precision is
|
||||
absolutely required as the performance impact of using it can be
|
||||
significant.
|
||||
|
||||
Declare Variables In The Scope Where They're Used
|
||||
-------------------------------------------------
|
||||
|
||||
Performance is slightly improved by declaring variables at the same block
|
||||
scope where they are first used. For example, in code like the
|
||||
following, if the lifetime of ``foo`` is only within the scope of the
|
||||
``if`` clause, write the code like this:
|
||||
|
||||
::
|
||||
|
||||
float func() {
|
||||
....
|
||||
if (x < y) {
|
||||
float foo;
|
||||
... use foo ...
|
||||
}
|
||||
}
|
||||
|
||||
Try not to write code as:
|
||||
|
||||
::
|
||||
|
||||
float func() {
|
||||
float foo;
|
||||
....
|
||||
if (x < y) {
|
||||
... use foo ...
|
||||
}
|
||||
}
|
||||
|
||||
Declaring variables in the innermost scope where they are used can reduce the number of masked store instructions that the
|
||||
compiler needs to generate.
|
||||
|
||||
Instrumenting ISPC Programs To Understand Runtime Behavior
|
||||
----------------------------------------------------------
|
||||
|
||||
``ispc`` has an optional instrumentation feature that can help you
|
||||
understand performance issues. If a program is compiled using the
|
||||
``--instrument`` flag, the compiler emits calls to a function with the
|
||||
following signature at various points in the program (for
|
||||
example, at interesting points in the control flow, when scatters or
|
||||
gathers happen.)
|
||||
|
||||
::
|
||||
|
||||
extern "C" {
|
||||
void ISPCInstrument(const char *fn, const char *note,
|
||||
int line, uint64_t mask);
|
||||
}
|
||||
|
||||
This function is passed the file name of the ``ispc`` file running, a short
|
||||
note indicating what is happening, the line number in the source file, and
|
||||
the current mask of active program instances in the gang. You must provide an
|
||||
implementation of this function and link it in with your application.
|
||||
|
||||
For example, when the ``ispc`` program runs, this function might be called
|
||||
as follows:
|
||||
|
||||
::
|
||||
|
||||
ISPCInstrument("foo.ispc", "function entry", 55, 0xfull);
|
||||
|
||||
This call indicates that the currently executing program has just
|
||||
entered the function defined at line 55 of the file ``foo.ispc``, with a
|
||||
mask of all lanes currently executing (assuming a four-wide gang size
|
||||
target machine).
|
||||
|
||||
For a fuller example of the utility of this functionality, see
|
||||
``examples/aobench_instrumented`` in the ``ispc`` distribution. This
|
||||
example includes an implementation of the ``ISPCInstrument()`` function
|
||||
that collects aggregate data about the program's execution behavior.
|
||||
|
||||
When running this example, you will want to direct the ``ao`` executable
|
||||
to generate a low resolution image, because the instrumentation adds
|
||||
substantial execution overhead. For example:
|
||||
|
||||
::
|
||||
|
||||
% ./ao 1 32 32
|
||||
|
||||
After the ``ao`` program exits, a summary report along the following lines
|
||||
will be printed. In the first few lines, you can see how many times a few
|
||||
functions were called, and the average percentage of SIMD lanes that were
|
||||
active upon function entry.
|
||||
|
||||
::
|
||||
|
||||
ao.ispc(0067) - function entry: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
|
||||
ao.ispc(0067) - return: uniform control flow: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
|
||||
ao.ispc(0071) - function entry: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
|
||||
ao.ispc(0075) - return: uniform control flow: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
|
||||
ao.ispc(0079) - function entry: 10072 calls (0 / 0.00% all off!), 45.09% active lanes
|
||||
ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes
|
||||
...
|
||||
|
||||
|
||||
Choosing A Target Vector Width
|
||||
------------------------------
|
||||
|
||||
By default, ``ispc`` compiles to the natural vector width of the target
|
||||
instruction set. For example, for SSE2 and SSE4, it compiles four-wide,
|
||||
and for AVX, it compiles 8-wide. For some programs, higher performance may
|
||||
be seen if the program is compiled to a doubled vector width--8-wide for
|
||||
SSE and 16-wide for AVX.
|
||||
|
||||
For workloads that don't require many registers, this method can lead to
|
||||
significantly more efficient execution thanks to greater instruction level
|
||||
parallelism and amortization of various overhead over more program
|
||||
instances. For other workloads, it may lead to a slowdown due to higher
|
||||
register pressure; trying both approaches for key kernels may be
|
||||
worthwhile.
|
||||
|
||||
This option is available only for the SSE2, SSE4 and AVX targets.
|
||||
It is selected with the ``--target=sse2-x2``, ``--target=sse4-x2`` and
|
||||
``--target=avx-x2`` options, respectively.
|
||||
|
||||
|
||||
Disclaimer and Legal Information
|
||||
================================
|
||||
|
||||
INFORMATION IN THIS DOCUMENT IS PROVIDED IN CONNECTION WITH INTEL(R) PRODUCTS.
|
||||
NO LICENSE, EXPRESS OR IMPLIED, BY ESTOPPEL OR OTHERWISE, TO ANY INTELLECTUAL
|
||||
PROPERTY RIGHTS IS GRANTED BY THIS DOCUMENT. EXCEPT AS PROVIDED IN INTEL'S TERMS
|
||||
AND CONDITIONS OF SALE FOR SUCH PRODUCTS, INTEL ASSUMES NO LIABILITY WHATSOEVER,
|
||||
AND INTEL DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY, RELATING TO SALE AND/OR USE
|
||||
OF INTEL PRODUCTS INCLUDING LIABILITY OR WARRANTIES RELATING TO FITNESS FOR A
|
||||
PARTICULAR PURPOSE, MERCHANTABILITY, OR INFRINGEMENT OF ANY PATENT, COPYRIGHT
|
||||
OR OTHER INTELLECTUAL PROPERTY RIGHT.
|
||||
|
||||
UNLESS OTHERWISE AGREED IN WRITING BY INTEL, THE INTEL PRODUCTS ARE NOT DESIGNED
|
||||
NOR INTENDED FOR ANY APPLICATION IN WHICH THE FAILURE OF THE INTEL PRODUCT COULD
|
||||
CREATE A SITUATION WHERE PERSONAL INJURY OR DEATH MAY OCCUR.
|
||||
|
||||
Intel may make changes to specifications and product descriptions at any time,
|
||||
without notice. Designers must not rely on the absence or characteristics of any
|
||||
features or instructions marked "reserved" or "undefined." Intel reserves these
|
||||
for future definition and shall have no responsibility whatsoever for conflicts
|
||||
or incompatibilities arising from future changes to them. The information here
|
||||
is subject to change without notice. Do not finalize a design with this
|
||||
information.
|
||||
|
||||
The products described in this document may contain design defects or errors
|
||||
known as errata which may cause the product to deviate from published
|
||||
specifications. Current characterized errata are available on request.
|
||||
|
||||
Contact your local Intel sales office or your distributor to obtain the latest
|
||||
specifications and before placing your product order.
|
||||
|
||||
Copies of documents which have an order number and are referenced in this
|
||||
document, or other Intel literature, may be obtained by calling 1-800-548-4725,
|
||||
or by visiting Intel's Web Site.
|
||||
|
||||
Intel processor numbers are not a measure of performance. Processor numbers
|
||||
differentiate features within each processor family, not across different
|
||||
processor families. See http://www.intel.com/products/processor_number for
|
||||
details.
|
||||
|
||||
BunnyPeople, Celeron, Celeron Inside, Centrino, Centrino Atom,
|
||||
Centrino Atom Inside, Centrino Inside, Centrino logo, Core Inside, FlashFile,
|
||||
i960, InstantIP, Intel, Intel logo, Intel386, Intel486, IntelDX2, IntelDX4,
|
||||
IntelSX2, Intel Atom, Intel Atom Inside, Intel Core, Intel Inside,
|
||||
Intel Inside logo, Intel. Leap ahead., Intel. Leap ahead. logo, Intel NetBurst,
|
||||
Intel NetMerge, Intel NetStructure, Intel SingleDriver, Intel SpeedStep,
|
||||
Intel StrataFlash, Intel Viiv, Intel vPro, Intel XScale, Itanium,
|
||||
Itanium Inside, MCS, MMX, Oplus, OverDrive, PDCharm, Pentium, Pentium Inside,
|
||||
skoool, Sound Mark, The Journey Inside, Viiv Inside, vPro Inside, VTune, Xeon,
|
||||
and Xeon Inside are trademarks of Intel Corporation in the U.S. and other
|
||||
countries.
|
||||
|
||||
* Other names and brands may be claimed as the property of others.
|
||||
|
||||
Copyright(C) 2011, Intel Corporation. All rights reserved.
|
||||
|
||||
|
||||
Optimization Notice
|
||||
===================
|
||||
|
||||
Intel compilers, associated libraries and associated development tools may
|
||||
include or utilize options that optimize for instruction sets that are
|
||||
available in both Intel and non-Intel microprocessors (for example SIMD
|
||||
instruction sets), but do not optimize equally for non-Intel
|
||||
microprocessors. In addition, certain compiler options for Intel
|
||||
compilers, including some that are not specific to Intel
|
||||
micro-architecture, are reserved for Intel microprocessors. For a detailed
|
||||
description of Intel compiler options, including the instruction sets and
|
||||
specific microprocessors they implicate, please refer to the "Intel
|
||||
Compiler User and Reference Guides" under "Compiler Options." Many library
|
||||
routines that are part of Intel compiler products are more highly optimized
|
||||
for Intel microprocessors than for other microprocessors. While the
|
||||
compilers and libraries in Intel compiler products offer optimizations for
|
||||
both Intel and Intel-compatible microprocessors, depending on the options
|
||||
you select, your code and other factors, you likely will get extra
|
||||
performance on Intel microprocessors.
|
||||
|
||||
Intel compilers, associated libraries and associated development tools may
|
||||
or may not optimize to the same degree for non-Intel microprocessors for
|
||||
optimizations that are not unique to Intel microprocessors. These
|
||||
optimizations include Intel® Streaming SIMD Extensions 2 (Intel® SSE2),
|
||||
Intel® Streaming SIMD Extensions 3 (Intel® SSE3), and Supplemental
|
||||
Streaming SIMD Extensions 3 (Intel SSSE3) instruction sets and other
|
||||
optimizations. Intel does not guarantee the availability, functionality,
|
||||
or effectiveness of any optimization on microprocessors not manufactured by
|
||||
Intel. Microprocessor-dependent optimizations in this product are intended
|
||||
for use with Intel microprocessors.
|
||||
|
||||
While Intel believes our compilers and libraries are excellent choices to
|
||||
assist in obtaining the best performance on Intel and non-Intel
|
||||
microprocessors, Intel recommends that you evaluate other compilers and
|
||||
libraries to determine which best meet your requirements. We hope to win
|
||||
your business by striving to offer the best performance of any compiler or
|
||||
library; please let us know if you find we do not.
|
||||
|
||||
66
docs/template-news.txt
Normal file
66
docs/template-news.txt
Normal file
@@ -0,0 +1,66 @@
|
||||
%(head_prefix)s
|
||||
%(head)s
|
||||
<script type="text/javascript">
|
||||
|
||||
var _gaq = _gaq || [];
|
||||
_gaq.push(['_setAccount', 'UA-1486404-4']);
|
||||
_gaq.push(['_trackPageview']);
|
||||
|
||||
(function() {
|
||||
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
||||
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
||||
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
||||
})();
|
||||
|
||||
</script>
|
||||
%(stylesheet)s
|
||||
%(body_prefix)s
|
||||
<div id="wrap">
|
||||
<div id="wrap2">
|
||||
<div id="header">
|
||||
<h1 id="logo">Intel SPMD Program Compiler</h1>
|
||||
<div id="slogan">An open-source compiler for high-performance SIMD programming on
|
||||
the CPU</div>
|
||||
</div>
|
||||
<div id="nav">
|
||||
<div id="nbar">
|
||||
<ul>
|
||||
<li><a href="index.html">Overview</a></li>
|
||||
<li id="selected"><a href="news.html">News</a></li>
|
||||
<li><a href="features.html">Features</a></li>
|
||||
<li><a href="downloads.html">Downloads</a></li>
|
||||
<li><a href="documentation.html">Documentation</a></li>
|
||||
<li><a href="perf.html">Performance</a></li>
|
||||
<li><a href="contrib.html">Contributors</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
<div id="content-wrap">
|
||||
<div id="sidebar">
|
||||
<div class="widgetspace">
|
||||
<h1>Resources</h1>
|
||||
<ul class="menu">
|
||||
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-users/">ispc
|
||||
users mailing list</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
|
||||
developers mailing list</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
%(body_pre_docinfo)s
|
||||
%(docinfo)s
|
||||
<div id="content">
|
||||
%(body)s
|
||||
</div>
|
||||
<div class="clearfix"></div>
|
||||
<div id="footer"> © 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<!-- Please Do Not remove this link, thank u -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
%(body_suffix)s
|
||||
66
docs/template-perf.txt
Normal file
66
docs/template-perf.txt
Normal file
@@ -0,0 +1,66 @@
|
||||
%(head_prefix)s
|
||||
%(head)s
|
||||
<script type="text/javascript">
|
||||
|
||||
var _gaq = _gaq || [];
|
||||
_gaq.push(['_setAccount', 'UA-1486404-4']);
|
||||
_gaq.push(['_trackPageview']);
|
||||
|
||||
(function() {
|
||||
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
||||
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
||||
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
||||
})();
|
||||
|
||||
</script>
|
||||
%(stylesheet)s
|
||||
%(body_prefix)s
|
||||
<div id="wrap">
|
||||
<div id="wrap2">
|
||||
<div id="header">
|
||||
<h1 id="logo">Intel SPMD Program Compiler</h1>
|
||||
<div id="slogan">An open-source compiler for high-performance SIMD programming on
|
||||
the CPU</div>
|
||||
</div>
|
||||
<div id="nav">
|
||||
<div id="nbar">
|
||||
<ul>
|
||||
<li><a href="index.html">Overview</a></li>
|
||||
<li><a href="news.html">News</a></li>
|
||||
<li><a href="features.html">Features</a></li>
|
||||
<li><a href="downloads.html">Downloads</a></li>
|
||||
<li><a href="documentation.html">Documentation</a></li>
|
||||
<li id="selected"><a href="perf.html">Performance</a></li>
|
||||
<li><a href="contrib.html">Contributors</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
<div id="content-wrap">
|
||||
<div id="sidebar">
|
||||
<div class="widgetspace">
|
||||
<h1>Resources</h1>
|
||||
<ul class="menu">
|
||||
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-users/">ispc
|
||||
users mailing list</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
|
||||
developers mailing list</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
%(body_pre_docinfo)s
|
||||
%(docinfo)s
|
||||
<div id="content">
|
||||
%(body)s
|
||||
</div>
|
||||
<div class="clearfix"></div>
|
||||
<div id="footer"> © 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<!-- Please Do Not remove this link, thank u -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
%(body_suffix)s
|
||||
66
docs/template.txt
Normal file
66
docs/template.txt
Normal file
@@ -0,0 +1,66 @@
|
||||
%(head_prefix)s
|
||||
%(head)s
|
||||
<script type="text/javascript">
|
||||
|
||||
var _gaq = _gaq || [];
|
||||
_gaq.push(['_setAccount', 'UA-1486404-4']);
|
||||
_gaq.push(['_trackPageview']);
|
||||
|
||||
(function() {
|
||||
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
||||
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
||||
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
||||
})();
|
||||
|
||||
</script>
|
||||
%(stylesheet)s
|
||||
%(body_prefix)s
|
||||
<div id="wrap">
|
||||
<div id="wrap2">
|
||||
<div id="header">
|
||||
<h1 id="logo">Intel SPMD Program Compiler</h1>
|
||||
<div id="slogan">An open-source compiler for high-performance SIMD programming on
|
||||
the CPU</div>
|
||||
</div>
|
||||
<div id="nav">
|
||||
<div id="nbar">
|
||||
<ul>
|
||||
<li><a href="index.html">Overview</a></li>
|
||||
<li><a href="news.html">News</a></li>
|
||||
<li><a href="features.html">Features</a></li>
|
||||
<li><a href="downloads.html">Downloads</a></li>
|
||||
<li id="selected"><a href="documentation.html">Documentation</a></li>
|
||||
<li><a href="perf.html">Performance</a></li>
|
||||
<li><a href="contrib.html">Contributors</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
<div id="content-wrap">
|
||||
<div id="sidebar">
|
||||
<div class="widgetspace">
|
||||
<h1>Resources</h1>
|
||||
<ul class="menu">
|
||||
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-users/">ispc
|
||||
users mailing list</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
|
||||
developers mailing list</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
%(body_pre_docinfo)s
|
||||
%(docinfo)s
|
||||
<div id="content">
|
||||
%(body)s
|
||||
</div>
|
||||
<div class="clearfix"></div>
|
||||
<div id="footer"> © 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<!-- Please Do Not remove this link, thank u -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
%(body_suffix)s
|
||||
@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
|
||||
# This could be handy for archiving the generated documentation or
|
||||
# if some version control system is used.
|
||||
|
||||
PROJECT_NUMBER = 1.0.7
|
||||
PROJECT_NUMBER = 1.3.0
|
||||
|
||||
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
|
||||
# base path where the generated documentation will be put.
|
||||
@@ -585,7 +585,6 @@ INPUT = builtins.h \
|
||||
ctx.h \
|
||||
decl.h \
|
||||
expr.h \
|
||||
gatherbuf.h \
|
||||
ispc.h \
|
||||
llvmutil.h \
|
||||
module.h \
|
||||
@@ -598,7 +597,6 @@ INPUT = builtins.h \
|
||||
ctx.cpp \
|
||||
decl.cpp \
|
||||
expr.cpp \
|
||||
gatherbuf.cpp \
|
||||
ispc.cpp \
|
||||
llvmutil.cpp \
|
||||
main.cpp \
|
||||
|
||||
@@ -13,6 +13,7 @@ against regular serial C++ implementations, printing out a comparison of
|
||||
the runtimes and the speedup delivered by ispc. It may be instructive to
|
||||
do a side-by-side diff of the C++ and ispc implementations of these
|
||||
algorithms to learn more about writing ispc code.
|
||||
|
||||
|
||||
AOBench
|
||||
=======
|
||||
@@ -27,6 +28,7 @@ It executes the program for the given number of iterations, rendering an
|
||||
(xres x yres) image each time and measuring the computation time with both
|
||||
serial and ispc implementations.
|
||||
|
||||
|
||||
AOBench_Instrumented
|
||||
====================
|
||||
|
||||
@@ -37,8 +39,39 @@ example implementation of this function that counts the number of times the
|
||||
callback is made and records some statistics about control flow coherence
|
||||
is provided in the instrument.cpp file.
|
||||
|
||||
*** Note: on Linux, this example currently hits an assertion in LLVM during
|
||||
*** compilation
|
||||
|
||||
Deferred
|
||||
========
|
||||
|
||||
This is an extensive example of using ispc for efficient
|
||||
deferred shading of scenes with thousands of lights; it's an implementation
|
||||
of the algorithm that Johan Andersson described at SIGGRAPH 2009,
|
||||
implemented by Andrew Lauritzen and Jefferson Montgomery. The basic idea
|
||||
is that a pre-rendered G-buffer is partitioned into tiles, and in each
|
||||
tile, the set of lights that contribute to the tile is first computed.
|
||||
Then, the pixels in the tile are shaded using just those light
|
||||
sources. (See slides 19-29 of
|
||||
http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
|
||||
for more details on the algorithm.)
|
||||
|
||||
This directory includes three implementations of the algorithm:
|
||||
|
||||
- An ispc implementation that first does a static partitioning of the
|
||||
screen into tiles to parallelize across the CPU cores. Within each tile
|
||||
ispc kernels provide highly efficient implementations of the light
|
||||
culling and shading calculations.
|
||||
- A "best practices" serial C++ implementation. This implementation does a
|
||||
dynamic partitioning of the screen, refining tiles with significant Z
|
||||
depth complexity (these tiles often have a large number of lights that
|
||||
affect them). Within each final tile, the pixels are shaded using
|
||||
regular C++ code.
|
||||
- If the Cilk extensions are available in your compiler, an ispc
|
||||
implementation that uses Cilk will also be built.
|
||||
(See http://software.intel.com/en-us/articles/intel-cilk-plus/). Like
|
||||
the "best practices" serial implementation, this version does dynamic
|
||||
tile partitioning for better load balancing and then uses ispc for the
|
||||
light culling and shading.
|
||||
|
||||
|
||||
Mandelbrot
|
||||
==========
|
||||
@@ -46,6 +79,7 @@ Mandelbrot
|
||||
Mandelbrot set generation. This example is extensively documented at the
|
||||
http://ispc.github.com/example.html page.
|
||||
|
||||
|
||||
Mandelbrot_tasks
|
||||
================
|
||||
|
||||
@@ -58,6 +92,7 @@ using tasks with ispc, no task system is mandated; the user is free to plug
|
||||
in any task system they want, for ease of interoperating with existing task
|
||||
systems.
|
||||
|
||||
|
||||
Noise
|
||||
=====
|
||||
|
||||
@@ -71,6 +106,14 @@ Options
|
||||
This program implements both the Black-Scholes and Binomial options pricing
|
||||
models in both ispc and regular serial C++ code.
|
||||
|
||||
|
||||
Perfbench
|
||||
=========
|
||||
|
||||
This runs a number of microbenchmarks to measure system performance and
|
||||
code generation quality.
|
||||
|
||||
|
||||
RT
|
||||
==
|
||||
|
||||
@@ -87,6 +130,7 @@ and triangle intersection code from pbrt; see the pbrt source code and/or
|
||||
"Physically Based Rendering" book for more about the basic algorithmic
|
||||
details.
|
||||
|
||||
|
||||
Simple
|
||||
======
|
||||
|
||||
@@ -94,6 +138,7 @@ This is a simple "hello world" type program that shows a ~10 line
|
||||
application program calling out to a ~5 line ispc program to do a simple
|
||||
computation.
|
||||
|
||||
|
||||
Volume
|
||||
======
|
||||
|
||||
|
||||
@@ -1,26 +1,7 @@
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --fast-math --arch=x86-64
|
||||
EXAMPLE=ao
|
||||
CPP_SRC=ao.cpp ao_serial.cpp
|
||||
ISPC_SRC=ao.ispc
|
||||
ISPC_TARGETS=sse2,sse4,avx
|
||||
|
||||
default: ao
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ ao
|
||||
|
||||
ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o -lm -lpthread
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/ao.o: objs/ao_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
include ../common.mk
|
||||
|
||||
@@ -55,7 +55,6 @@
|
||||
using namespace ispc;
|
||||
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
|
||||
#define NSUBSAMPLES 2
|
||||
|
||||
@@ -105,38 +104,6 @@ savePPM(const char *fname, int w, int h)
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc != 4) {
|
||||
@@ -151,8 +118,6 @@ int main(int argc, char **argv)
|
||||
height = atoi (argv[3]);
|
||||
}
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
// Allocate space for output images
|
||||
img = new unsigned char[width * height * 3];
|
||||
fimg = new float[width * height * 3];
|
||||
@@ -173,10 +138,30 @@ int main(int argc, char **argv)
|
||||
}
|
||||
|
||||
// Report results and save image
|
||||
printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", minTimeISPC,
|
||||
width, height);
|
||||
printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n",
|
||||
minTimeISPC, width, height);
|
||||
savePPM("ao-ispc.ppm", width, height);
|
||||
|
||||
//
|
||||
// Run the ispc + tasks path, test_iterations times, and report the
|
||||
// minimum time for any of them.
|
||||
//
|
||||
double minTimeISPCTasks = 1e30;
|
||||
for (unsigned int i = 0; i < test_iterations; i++) {
|
||||
memset((void *)fimg, 0, sizeof(float) * width * height * 3);
|
||||
assert(NSUBSAMPLES == 2);
|
||||
|
||||
reset_and_start_timer();
|
||||
ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
|
||||
double t = get_elapsed_mcycles();
|
||||
minTimeISPCTasks = std::min(minTimeISPCTasks, t);
|
||||
}
|
||||
|
||||
// Report results and save image
|
||||
printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n",
|
||||
minTimeISPCTasks, width, height);
|
||||
savePPM("ao-ispc-tasks.ppm", width, height);
|
||||
|
||||
//
|
||||
// Run the serial path, again test_iteration times, and report the
|
||||
// minimum time.
|
||||
@@ -193,7 +178,8 @@ int main(int argc, char **argv)
|
||||
// Report more results, save another image...
|
||||
printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial,
|
||||
width, height);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||
savePPM("ao-serial.ppm", width, height);
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -50,7 +50,6 @@ struct Isect {
|
||||
struct Sphere {
|
||||
vec center;
|
||||
float radius;
|
||||
|
||||
};
|
||||
|
||||
struct Plane {
|
||||
@@ -75,16 +74,15 @@ static inline vec vcross(vec v0, vec v1) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void vnormalize(reference vec v) {
|
||||
static inline void vnormalize(vec &v) {
|
||||
float len2 = dot(v, v);
|
||||
float invlen = rsqrt(len2);
|
||||
v *= invlen;
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
ray_plane_intersect(reference Isect isect, reference Ray ray,
|
||||
reference Plane plane) {
|
||||
static void
|
||||
ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) {
|
||||
float d = -dot(plane.p, plane.n);
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
@@ -104,8 +102,7 @@ ray_plane_intersect(reference Isect isect, reference Ray ray,
|
||||
|
||||
|
||||
static inline void
|
||||
ray_sphere_intersect(reference Isect isect, reference Ray ray,
|
||||
reference Sphere sphere) {
|
||||
ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) {
|
||||
vec rs = ray.org - sphere.center;
|
||||
|
||||
float B = dot(rs, ray.dir);
|
||||
@@ -126,8 +123,8 @@ ray_sphere_intersect(reference Isect isect, reference Ray ray,
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
orthoBasis(reference vec basis[3], vec n) {
|
||||
static void
|
||||
orthoBasis(vec basis[3], vec n) {
|
||||
basis[2] = n;
|
||||
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
||||
|
||||
@@ -149,9 +146,9 @@ orthoBasis(reference vec basis[3], vec n) {
|
||||
}
|
||||
|
||||
|
||||
static inline float
|
||||
ambient_occlusion(reference Isect isect, reference Plane plane,
|
||||
reference Sphere spheres[3], reference RNGState rngstate) {
|
||||
static float
|
||||
ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3],
|
||||
RNGState &rngstate) {
|
||||
float eps = 0.0001f;
|
||||
vec p, n;
|
||||
vec basis[3];
|
||||
@@ -168,8 +165,8 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
|
||||
Ray ray;
|
||||
Isect occIsect;
|
||||
|
||||
float theta = sqrt(frandom(rngstate));
|
||||
float phi = 2.0f * M_PI * frandom(rngstate);
|
||||
float theta = sqrt(frandom(&rngstate));
|
||||
float phi = 2.0f * M_PI * frandom(&rngstate);
|
||||
float x = cos(phi) * theta;
|
||||
float y = sin(phi) * theta;
|
||||
float z = sqrt(1.0 - theta * theta);
|
||||
@@ -203,109 +200,55 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
|
||||
/* Compute the image for the scanlines from [y0,y1), for an overall image
|
||||
of width w and height h.
|
||||
*/
|
||||
void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
uniform int nsubsamples, reference uniform float image[]) {
|
||||
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
static Sphere spheres[3] = {
|
||||
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
static uniform Sphere spheres[3] = {
|
||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
|
||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||
RNGState rngstate;
|
||||
|
||||
seed_rng(rngstate, y0);
|
||||
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
|
||||
float invSamples = 1.f / nsubsamples;
|
||||
|
||||
// Compute the mapping between the 'programCount'-wide program
|
||||
// instances running in parallel and samples in the image.
|
||||
//
|
||||
// For now, we'll always take four samples per pixel, so start by
|
||||
// initializing du and dv with offsets into subpixel samples. We'll
|
||||
// take care of further updating du and dv for the case where we're
|
||||
// doing more than 4 program instances in parallel shortly.
|
||||
uniform float uSteps[4] = { 0, 1, 0, 1 };
|
||||
uniform float vSteps[4] = { 0, 0, 1, 1 };
|
||||
float du = uSteps[programIndex % 4] / nsubsamples;
|
||||
float dv = vSteps[programIndex % 4] / nsubsamples;
|
||||
foreach_tiled(y = y0 ... y1, x = 0 ... w,
|
||||
u = 0 ... nsubsamples, v = 0 ... nsubsamples) {
|
||||
float du = (float)u * invSamples, dv = (float)v * invSamples;
|
||||
|
||||
// Now handle the case where we are able to do more than one pixel's
|
||||
// worth of work at once. nx records the number of pixels in the x
|
||||
// direction we do per iteration and ny the number in y.
|
||||
uniform int nx = 1, ny = 1;
|
||||
// Figure out x,y pixel in NDC
|
||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||
float ret = 0.f;
|
||||
Ray ray;
|
||||
Isect isect;
|
||||
|
||||
if (programCount == 8) {
|
||||
// Do two pixels at once in the x direction
|
||||
nx = 2;
|
||||
if (programIndex >= 4)
|
||||
// And shift the offsets for the second pixel's worth of work
|
||||
++du;
|
||||
}
|
||||
else if (programCount == 16) {
|
||||
// Two at once in both x and y
|
||||
nx = ny = 2;
|
||||
if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
|
||||
++du;
|
||||
if (programIndex >= 8)
|
||||
++dv;
|
||||
}
|
||||
ray.org = 0.f;
|
||||
|
||||
// Now loop over all of the pixels, stepping in x and y as calculated
|
||||
// above. (Assumes that ny divides y and nx divides x...)
|
||||
for (uniform int y = y0; y < y1; y += ny) {
|
||||
for (uniform int x = 0; x < w; x += nx) {
|
||||
// Figur out x,y pixel in NDC
|
||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||
float ret = 0.f;
|
||||
Ray ray;
|
||||
Isect isect;
|
||||
// Poor man's perspective projection
|
||||
ray.dir.x = px;
|
||||
ray.dir.y = py;
|
||||
ray.dir.z = -1.0;
|
||||
vnormalize(ray.dir);
|
||||
|
||||
ray.org = 0.f;
|
||||
isect.t = 1.0e+17;
|
||||
isect.hit = 0;
|
||||
|
||||
// Poor man's perspective projection
|
||||
ray.dir.x = px;
|
||||
ray.dir.y = py;
|
||||
ray.dir.z = -1.0;
|
||||
vnormalize(ray.dir);
|
||||
for (uniform int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(isect, ray, spheres[snum]);
|
||||
ray_plane_intersect(isect, ray, plane);
|
||||
|
||||
isect.t = 1.0e+17;
|
||||
isect.hit = 0;
|
||||
// Note use of 'coherent' if statement; the set of rays we
|
||||
// trace will often all hit or all miss the scene
|
||||
cif (isect.hit) {
|
||||
ret = ambient_occlusion(isect, plane, spheres, rngstate);
|
||||
ret *= invSamples * invSamples;
|
||||
|
||||
for (uniform int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(isect, ray, spheres[snum]);
|
||||
ray_plane_intersect(isect, ray, plane);
|
||||
|
||||
// Note use of 'coherent' if statement; the set of rays we
|
||||
// trace will often all hit or all miss the scene
|
||||
cif (isect.hit)
|
||||
ret = ambient_occlusion(isect, plane, spheres, rngstate);
|
||||
|
||||
// This is a little grungy; we have results for
|
||||
// programCount-worth of values. Because we're doing 2x2
|
||||
// subsamples, we need to peel them off in groups of four,
|
||||
// average the four values for each pixel, and update the
|
||||
// output image.
|
||||
//
|
||||
// Store the varying value to a uniform array of the same size.
|
||||
// See the discussion about communication among program
|
||||
// instances in the ispc user's manual for more discussion on
|
||||
// this idiom.
|
||||
uniform float retArray[programCount];
|
||||
retArray[programIndex] = ret;
|
||||
|
||||
// offset to the first pixel in the image
|
||||
uniform int offset = 3 * (y * w + x);
|
||||
for (uniform int p = 0; p < programCount; p += 4, ++offset) {
|
||||
// Get the four sample values for this pixel
|
||||
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
|
||||
retArray[p+3];
|
||||
|
||||
// Normalize by number of samples taken
|
||||
sumret /= nsubsamples * nsubsamples;
|
||||
|
||||
// Store result in the image
|
||||
image[offset+0] = sumret;
|
||||
image[offset+1] = sumret;
|
||||
image[offset+2] = sumret;
|
||||
}
|
||||
int offset = 3 * (y * w + x);
|
||||
atomic_add_local(&image[offset], ret);
|
||||
atomic_add_local(&image[offset+1], ret);
|
||||
atomic_add_local(&image[offset+2], ret);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -315,3 +258,15 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
ao_scanlines(0, h, w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
|
||||
static void task ao_task(uniform int width, uniform int height,
|
||||
uniform int nsubsamples, uniform float image[]) {
|
||||
ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
|
||||
}
|
||||
|
||||
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
launch[h] ao_task(w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
27
examples/aobench/aobench.vcxproj
Executable file → Normal file
27
examples/aobench/aobench.vcxproj
Executable file → Normal file
@@ -21,22 +21,23 @@
|
||||
<ItemGroup>
|
||||
<ClCompile Include="ao.cpp" />
|
||||
<ClCompile Include="ao_serial.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="ao.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
@@ -85,15 +86,19 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ExecutablePath);$(ProjectDir)..\..</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -102,6 +107,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -117,6 +123,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -134,6 +141,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -152,6 +160,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -164,4 +173,4 @@
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
@@ -2,7 +2,7 @@
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -g3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --fast-math --instrument --arch=x86-64
|
||||
ISPCFLAGS=-O2 --instrument --arch=x86-64 --target=sse2
|
||||
|
||||
default: ao
|
||||
|
||||
@@ -14,13 +14,13 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ ao
|
||||
|
||||
ao: dirs objs/ao.o objs/instrument.o objs/ao_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/instrument.o -lm -lpthread
|
||||
ao: objs/ao.o objs/instrument.o objs/ao_ispc.o ../tasksys.cpp
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ -lm -lpthread
|
||||
|
||||
objs/%.o: %.cpp
|
||||
objs/%.o: %.cpp dirs
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/ao.o: objs/ao_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc dirs
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_instrumented_ispc.h
|
||||
|
||||
@@ -32,7 +32,6 @@
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
@@ -51,12 +50,11 @@
|
||||
#include <algorithm>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "ao_ispc.h"
|
||||
#include "ao_instrumented_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
#include "instrument.h"
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
|
||||
#define NSUBSAMPLES 2
|
||||
|
||||
@@ -104,37 +102,6 @@ savePPM(const char *fname, int w, int h)
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
@@ -150,8 +117,6 @@ int main(int argc, char **argv)
|
||||
height = atoi (argv[3]);
|
||||
}
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
// Allocate space for output images
|
||||
img = new unsigned char[width * height * 3];
|
||||
fimg = new float[width * height * 3];
|
||||
|
||||
@@ -75,7 +75,7 @@ static inline vec vcross(vec v0, vec v1) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void vnormalize(reference vec v) {
|
||||
static inline void vnormalize(vec &v) {
|
||||
float len2 = dot(v, v);
|
||||
float invlen = rsqrt(len2);
|
||||
v *= invlen;
|
||||
@@ -83,8 +83,7 @@ static inline void vnormalize(reference vec v) {
|
||||
|
||||
|
||||
static inline void
|
||||
ray_plane_intersect(reference Isect isect, reference Ray ray,
|
||||
reference Plane plane) {
|
||||
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
||||
float d = -dot(plane.p, plane.n);
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
@@ -104,8 +103,7 @@ ray_plane_intersect(reference Isect isect, reference Ray ray,
|
||||
|
||||
|
||||
static inline void
|
||||
ray_sphere_intersect(reference Isect isect, reference Ray ray,
|
||||
reference Sphere sphere) {
|
||||
ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
|
||||
vec rs = ray.org - sphere.center;
|
||||
|
||||
float B = dot(rs, ray.dir);
|
||||
@@ -127,7 +125,7 @@ ray_sphere_intersect(reference Isect isect, reference Ray ray,
|
||||
|
||||
|
||||
static inline void
|
||||
orthoBasis(reference vec basis[3], vec n) {
|
||||
orthoBasis(vec basis[3], vec n) {
|
||||
basis[2] = n;
|
||||
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
||||
|
||||
@@ -150,8 +148,8 @@ orthoBasis(reference vec basis[3], vec n) {
|
||||
|
||||
|
||||
static inline float
|
||||
ambient_occlusion(reference Isect isect, reference Plane plane,
|
||||
reference Sphere spheres[3], reference RNGState rngstate) {
|
||||
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
||||
RNGState &rngstate) {
|
||||
float eps = 0.0001f;
|
||||
vec p, n;
|
||||
vec basis[3];
|
||||
@@ -168,8 +166,8 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
|
||||
Ray ray;
|
||||
Isect occIsect;
|
||||
|
||||
float theta = sqrt(frandom(rngstate));
|
||||
float phi = 2.0f * M_PI * frandom(rngstate);
|
||||
float theta = sqrt(frandom(&rngstate));
|
||||
float phi = 2.0f * M_PI * frandom(&rngstate);
|
||||
float x = cos(phi) * theta;
|
||||
float y = sin(phi) * theta;
|
||||
float z = sqrt(1.0 - theta * theta);
|
||||
@@ -203,8 +201,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
|
||||
/* Compute the image for the scanlines from [y0,y1), for an overall image
|
||||
of width w and height h.
|
||||
*/
|
||||
void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
uniform int nsubsamples, reference uniform float image[]) {
|
||||
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
static Sphere spheres[3] = {
|
||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||
@@ -212,7 +211,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||
RNGState rngstate;
|
||||
|
||||
seed_rng(rngstate, y0);
|
||||
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
|
||||
|
||||
// Compute the mapping between the 'programCount'-wide program
|
||||
// instances running in parallel and samples in the image.
|
||||
@@ -231,6 +230,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
// direction we do per iteration and ny the number in y.
|
||||
uniform int nx = 1, ny = 1;
|
||||
|
||||
// FIXME: We actually need ny to be 1 regardless of the decomposition,
|
||||
// since the task decomposition is one scanline high.
|
||||
|
||||
if (programCount == 8) {
|
||||
// Do two pixels at once in the x direction
|
||||
nx = 2;
|
||||
@@ -239,19 +241,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
++du;
|
||||
}
|
||||
else if (programCount == 16) {
|
||||
// Two at once in both x and y
|
||||
nx = ny = 2;
|
||||
if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
|
||||
nx = 4;
|
||||
ny = 1;
|
||||
if (programIndex >= 4 && programIndex < 8)
|
||||
++du;
|
||||
if (programIndex >= 8)
|
||||
++dv;
|
||||
if (programIndex >= 8 && programIndex < 12)
|
||||
du += 2;
|
||||
if (programIndex >= 12)
|
||||
du += 3;
|
||||
}
|
||||
|
||||
// Now loop over all of the pixels, stepping in x and y as calculated
|
||||
// above. (Assumes that ny divides y and nx divides x...)
|
||||
for (uniform int y = y0; y < y1; y += ny) {
|
||||
for (uniform int x = 0; x < w; x += nx) {
|
||||
// Figur out x,y pixel in NDC
|
||||
// Figure out x,y pixel in NDC
|
||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||
float ret = 0.f;
|
||||
@@ -293,7 +297,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
|
||||
// offset to the first pixel in the image
|
||||
uniform int offset = 3 * (y * w + x);
|
||||
for (uniform int p = 0; p < programCount; p += 4, ++offset) {
|
||||
for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
|
||||
// Get the four sample values for this pixel
|
||||
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
|
||||
retArray[p+3];
|
||||
@@ -315,3 +319,15 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
ao_scanlines(0, h, w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
|
||||
static void task ao_task(uniform int width, uniform int height,
|
||||
uniform int nsubsamples, uniform float image[]) {
|
||||
ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
|
||||
}
|
||||
|
||||
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
launch[h] ao_task(w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
39
examples/aobench_instrumented/aobench_instrumented.vcxproj
Executable file → Normal file
39
examples/aobench_instrumented/aobench_instrumented.vcxproj
Executable file → Normal file
@@ -21,22 +21,23 @@
|
||||
<ItemGroup>
|
||||
<ClCompile Include="ao.cpp" />
|
||||
<ClCompile Include="instrument.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="ao.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
@@ -85,15 +86,23 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -101,7 +110,8 @@
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -114,7 +124,8 @@
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -129,7 +140,8 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -146,7 +158,8 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -158,4 +171,4 @@
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
65
examples/common.mk
Normal file
65
examples/common.mk
Normal file
@@ -0,0 +1,65 @@
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=tasksys.o
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O2 -m64
|
||||
LIBS=-lm $(TASK_LIB) -lstdc++
|
||||
ISPC=ispc -O2 --arch=x86-64 $(ISPC_FLAGS)
|
||||
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
|
||||
$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
|
||||
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
|
||||
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o) $(TASK_OBJ))
|
||||
|
||||
default: $(EXAMPLE)
|
||||
|
||||
all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
objs/%.cpp objs/%.o objs/%.h: dirs
|
||||
|
||||
# Remove the object directory, editor backups and every binary that the
# default/all targets can produce (including the -scalar variant).
clean:
	/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
|
||||
|
||||
$(EXAMPLE): $(CPP_OBJS) $(ISPC_OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
|
||||
|
||||
objs/%.o: %.cpp dirs $(ISPC_HEADER)
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp dirs
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC)
|
||||
$(ISPC) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp
|
||||
$(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
$(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC)
|
||||
$(ISPC) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp
|
||||
$(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC)
|
||||
$(ISPC) $< -o $@ --target=generic-1
|
||||
|
||||
$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
|
||||
8
examples/deferred/Makefile
Normal file
8
examples/deferred/Makefile
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
EXAMPLE=deferred_shading
|
||||
CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp
|
||||
ISPC_SRC=kernels.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_FLAGS=--opt=fast-math
|
||||
|
||||
include ../common.mk
|
||||
210
examples/deferred/common.cpp
Normal file
210
examples/deferred/common.cpp
Normal file
@@ -0,0 +1,210 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
#include "deferred.h"
|
||||
#include "../timing.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Allocates `size` bytes at an address that is a multiple of `alignment`.
//
// Windows and Linux defer to the platform allocator (_aligned_malloc /
// memalign).  Every other platform (including ISPC_IS_APPLE) uses a
// hand-rolled scheme: over-allocate with malloc(), round the address up, and
// stash the pointer malloc() returned in the slot just below the aligned
// block so that lAlignedFree() can find it again.
//
// Returns NULL if the allocation fails.  On the hand-rolled path NULL is also
// returned for a non-positive alignment; the mask arithmetic there assumes
// `alignment` is a power of two.
// Memory obtained here must be released with lAlignedFree(), not free().
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
#elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
#else
    if (alignment <= 0)
        return NULL;

    const size_t align = (size_t)alignment;
    // Worst case we skip (align - 1) bytes of padding plus the back-pointer.
    const size_t overhead = (align - 1) + sizeof(void *);
    if (size > (size_t)-1 - overhead)
        return NULL;  // size + overhead would wrap around

    char *raw = (char *)malloc(size + overhead);
    if (raw == NULL)
        return NULL;

    // Leave room for the back-pointer, then round *up* to the next multiple
    // of `align`.  An already-aligned address gets zero padding, so the
    // aligned block always fits inside the malloc()ed region.
    char *aligned = raw + sizeof(void *);
    const size_t misalign = (size_t)((uintptr_t)aligned & (uintptr_t)(align - 1));
    if (misalign != 0)
        aligned += align - misalign;

    ((void **)aligned)[-1] = raw;
    return aligned;
#endif
}


// Releases memory obtained from lAlignedMalloc().  Passing NULL is a no-op
// on the hand-rolled path, matching free().
static void
lAlignedFree(void *ptr) {
#if defined(ISPC_IS_WINDOWS)
    _aligned_free(ptr);
#elif defined(ISPC_IS_LINUX)
    free(ptr);
#else
    // The slot just below the aligned block holds the pointer that malloc()
    // originally returned; that is what has to go back to free().
    if (ptr != NULL)
        free(((void **)ptr)[-1]);
#endif
}
|
||||
|
||||
|
||||
// Creates a planar framebuffer of width*height pixels: one separately
// allocated, ALIGNMENT_BYTES-aligned array of bytes per colour channel.
// The contents are left uninitialised; call clear() to zero them.
Framebuffer::Framebuffer(int width, int height) {
    nPixels = width * height;

    // Same allocation for each channel, in r, g, b order.
    uint8_t **channels[3] = { &r, &g, &b };
    for (int c = 0; c < 3; ++c)
        *channels[c] = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
}
|
||||
|
||||
|
||||
// Releases the three channel arrays allocated by the constructor.
Framebuffer::~Framebuffer() {
    uint8_t *channels[3] = { r, g, b };
    for (int c = 0; c < 3; ++c)
        lAlignedFree(channels[c]);
}
|
||||
|
||||
|
||||
void
|
||||
Framebuffer::clear() {
|
||||
memset(r, 0, nPixels);
|
||||
memset(g, 0, nPixels);
|
||||
memset(b, 0, nPixels);
|
||||
}
|
||||
|
||||
|
||||
// Loads a scene dump from `path`.
//
// The file consists of an ispc::InputHeader followed by a single data chunk
// of header.inputDataChunkSize bytes.  The chunk is read into an
// ALIGNMENT_BYTES-aligned buffer and every pointer in input->arrays is set
// to chunk + header.inputDataArrayOffsets[<matching ida* index>].
//
// Returns a newly allocated InputData on success.  Returns NULL if the file
// cannot be opened, is shorter than the header says, or the chunk cannot be
// allocated; in that case nothing is leaked and (except for the silent
// "cannot open" case) a message is printed to stderr.
InputData *
CreateInputDataFromFile(const char *path) {
    FILE *in = fopen(path, "rb");
    if (!in) return 0;

    InputData *input = new InputData;
    input->chunk = NULL;

    // Load header
    if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) {
        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
        fclose(in);
        delete input;
        return NULL;
    }

    // Load data chunk and update pointers
    input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
                                             ALIGNMENT_BYTES);
    if (input->chunk == NULL) {
        fprintf(stderr, "Unable to allocate data chunk for file \"%s\"\n", path);
        fclose(in);
        delete input;
        return NULL;
    }
    if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
        lAlignedFree(input->chunk);
        fclose(in);
        delete input;
        return NULL;
    }

    // The whole file has been consumed; the pointers below only reference
    // the in-memory chunk.
    fclose(in);

    input->arrays.zBuffer =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
    input->arrays.normalEncoded_x =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]];
    input->arrays.normalEncoded_y =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]];
    input->arrays.specularAmount =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]];
    input->arrays.specularPower =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]];
    input->arrays.albedo_x =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]];
    input->arrays.albedo_y =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]];
    input->arrays.albedo_z =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]];
    input->arrays.lightPositionView_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]];
    input->arrays.lightPositionView_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]];
    input->arrays.lightPositionView_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]];
    input->arrays.lightAttenuationBegin =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]];
    input->arrays.lightColor_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]];
    input->arrays.lightColor_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]];
    input->arrays.lightColor_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]];
    input->arrays.lightAttenuationEnd =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]];

    return input;
}
|
||||
|
||||
|
||||
// Releases the aligned data chunk owned by `input`.
// NOTE(review): the InputData object itself is not deleted here even though
// CreateInputDataFromFile() allocates it with new -- confirm whether callers
// are expected to delete it.
void DeleteInputData(InputData *input) {
    uint8_t *chunk = input->chunk;
    lAlignedFree(chunk);
}
|
||||
|
||||
|
||||
void WriteFrame(const char *filename, const InputData *input,
|
||||
const Framebuffer &framebuffer) {
|
||||
// Deswizzle and copy to RGBA output
|
||||
// Doesn't need to be fast... only happens once
|
||||
size_t imageBytes = 3 * input->header.framebufferWidth *
|
||||
input->header.framebufferHeight;
|
||||
uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
|
||||
memset(framebufferAOS, 0, imageBytes);
|
||||
|
||||
for (int i = 0; i < input->header.framebufferWidth *
|
||||
input->header.framebufferHeight; ++i) {
|
||||
framebufferAOS[3 * i + 0] = framebuffer.r[i];
|
||||
framebufferAOS[3 * i + 1] = framebuffer.g[i];
|
||||
framebufferAOS[3 * i + 2] = framebuffer.b[i];
|
||||
}
|
||||
|
||||
// Write out simple PPM file
|
||||
FILE *out = fopen(filename, "wb");
|
||||
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
fwrite(framebufferAOS, imageBytes, 1, out);
|
||||
fclose(out);
|
||||
|
||||
lAlignedFree(framebufferAOS);
|
||||
}
|
||||
BIN
examples/deferred/data/pp1280x720.bin
Normal file
BIN
examples/deferred/data/pp1280x720.bin
Normal file
Binary file not shown.
BIN
examples/deferred/data/pp1920x1200.bin
Normal file
BIN
examples/deferred/data/pp1920x1200.bin
Normal file
Binary file not shown.
108
examples/deferred/deferred.h
Normal file
108
examples/deferred/deferred.h
Normal file
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef DEFERRED_H
|
||||
#define DEFERRED_H
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||
#define MIN_TILE_WIDTH 16
|
||||
#define MIN_TILE_HEIGHT 16
|
||||
#define MAX_LIGHTS 1024
|
||||
|
||||
// Index of each input array in the header's inputDataArrayOffsets[] table.
// The values are consecutive, starting at 0; idaNum is the number of arrays.
enum InputDataArraysEnum {
    // Depth, normal, specular and albedo arrays
    idaZBuffer = 0,
    idaNormalEncoded_x, idaNormalEncoded_y,
    idaSpecularAmount, idaSpecularPower,
    idaAlbedo_x, idaAlbedo_y, idaAlbedo_z,

    // Light arrays
    idaLightPositionView_x, idaLightPositionView_y, idaLightPositionView_z,
    idaLightAttenuationBegin,
    idaLightColor_x, idaLightColor_y, idaLightColor_z,
    idaLightAttenuationEnd,

    // Must stay last: count of the entries above
    idaNum
};
|
||||
|
||||
#ifndef ISPC
|
||||
|
||||
#include <stdint.h>
|
||||
#include "kernels_ispc.h"
|
||||
|
||||
#define ALIGNMENT_BYTES 64
|
||||
|
||||
#define MAX_LIGHTS 1024
|
||||
|
||||
#define VISUALIZE_LIGHT_COUNT 0
|
||||
|
||||
struct InputData
|
||||
{
|
||||
ispc::InputHeader header;
|
||||
ispc::InputDataArrays arrays;
|
||||
uint8_t *chunk;
|
||||
};
|
||||
|
||||
|
||||
// Planar 8-bit RGB framebuffer: the red, green and blue channels are kept in
// three separate arrays that the object allocates in its constructor and
// frees in its destructor.
struct Framebuffer {
    // Allocates the three channel arrays for a width x height image.
    Framebuffer(int width, int height);
    ~Framebuffer();

    // Zeroes all three channels.
    void clear();

    // Channel arrays, one byte per pixel each.
    uint8_t *r, *g, *b;

private:
    int nPixels;

    // Not copyable: a member-wise copy would make two objects free the same
    // arrays.  Both operations are declared private and never defined.  The
    // assignment operator must take a reference (not a pointer) to actually
    // suppress the compiler-generated copy assignment.
    Framebuffer(const Framebuffer &);
    Framebuffer &operator=(const Framebuffer &);
};
|
||||
|
||||
|
||||
InputData *CreateInputDataFromFile(const char *path);
|
||||
void DeleteInputData(InputData *input);
|
||||
void WriteFrame(const char *filename, const InputData *input,
|
||||
const Framebuffer &framebuffer);
|
||||
void InitDynamicC(InputData *input);
|
||||
void InitDynamicCilk(InputData *input);
|
||||
void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
|
||||
void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
|
||||
|
||||
#endif // !ISPC
|
||||
|
||||
#endif // DEFERRED_H
|
||||
178
examples/deferred/deferred_shading.vcxproj
Executable file
178
examples/deferred/deferred_shading.vcxproj
Executable file
@@ -0,0 +1,178 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>mandelbrot</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="common.cpp" />
|
||||
<ClCompile Include="dynamic_c.cpp" />
|
||||
<ClCompile Include="dynamic_cilk.cpp" />
|
||||
<ClCompile Include="main.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="kernels.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
870
examples/deferred/dynamic_c.cpp
Normal file
870
examples/deferred/dynamic_c.cpp
Normal file
@@ -0,0 +1,870 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include <algorithm>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif // ISPC_IS_LINUX
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||
#define MIN_TILE_WIDTH 16
|
||||
#define MIN_TILE_HEIGHT 16
|
||||
|
||||
|
||||
#define DYNAMIC_TREE_LEVELS 5
|
||||
// If this is set to 1 then the result will be identical to the static version
|
||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||
|
||||
// Allocates `size` bytes at an address that is a multiple of `alignment`
// (which must be a power of two).  Returns NULL when the underlying
// allocator fails.  Memory obtained here must be released with
// lAlignedFree() and nothing else.
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
#elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
#else
    // OS X and any other platform: over-allocate with malloc(), round the
    // address up to the requested alignment, and remember the pointer that
    // malloc() returned in the slot just below the aligned address, which
    // is where lAlignedFree() looks for it.
    size_t align = (size_t)alignment;
    if (align < sizeof(void *))
        align = sizeof(void *);   // keeps the bookkeeping slot itself aligned

    // Worst case the aligned address sits sizeof(void*) + (align - 1) bytes
    // past the start of the raw allocation, so this is always enough room.
    void *mem = malloc(size + (align - 1) + sizeof(void *));
    if (mem == NULL)
        return NULL;

    uintptr_t addr = (uintptr_t)mem + sizeof(void *);
    addr = (addr + (align - 1)) & ~(uintptr_t)(align - 1);
    ((void **)addr)[-1] = mem;
    return (void *)addr;
#endif
}


// Releases memory returned by lAlignedMalloc().  Passing NULL is a no-op.
static void
lAlignedFree(void *ptr) {
#if defined(ISPC_IS_WINDOWS)
    _aligned_free(ptr);
#elif defined(ISPC_IS_LINUX)
    free(ptr);
#else
    // The pointer originally returned by malloc() is stored immediately
    // below the aligned block (see lAlignedMalloc).
    if (ptr != NULL)
        free(((void **)ptr)[-1]);
#endif
}
|
||||
|
||||
|
||||
// Finds the smallest and largest view-space depth inside the pixel rectangle
// [tileStartX, tileEndX) x [tileStartY, tileEndY) of the depth buffer.
//
// Each depth-buffer value z is unprojected as cameraProj_43 / (z - cameraProj_33).
// Only results in [cameraNear, cameraFar) take part in the bounds.  If no
// pixel qualifies, *minZ is cameraFar and *maxZ is cameraNear.
static void
ComputeZBounds(int tileStartX, int tileEndX,
               int tileStartY, int tileEndY,
               // G-buffer data
               float zBuffer[],
               int gBufferWidth,
               // Camera data
               float cameraProj_33, float cameraProj_43,
               float cameraNear, float cameraFar,
               // Output
               float *minZ, float *maxZ)
{
    // Start from an "empty" interval so the first valid sample replaces both ends.
    float nearest = cameraFar;
    float farthest = cameraNear;

    for (int row = tileStartY; row < tileEndY; ++row) {
        for (int col = tileStartX; col < tileEndX; ++col) {
            // Unproject the stored depth into view space.
            const float depth = zBuffer[(row * gBufferWidth + col)];
            const float viewZ = cameraProj_43 / (depth - cameraProj_33);

            // Skip skybox/background or otherwise invalid pixels.
            if (!(viewZ < cameraFar) || !(viewZ >= cameraNear))
                continue;

            if (viewZ < nearest)
                nearest = viewZ;
            if (farthest < viewZ)
                farthest = viewZ;
        }
    }

    *minZ = nearest;
    *maxZ = farthest;
}
|
||||
|
||||
|
||||
static void
|
||||
ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
|
||||
int numTilesX, int numTilesY,
|
||||
// G-buffer data
|
||||
float zBuffer[],
|
||||
int gBufferWidth,
|
||||
// Camera data
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar,
|
||||
// Output
|
||||
float minZArray[],
|
||||
float maxZArray[])
|
||||
{
|
||||
for (int tileX = 0; tileX < numTilesX; ++tileX) {
|
||||
float minZ, maxZ;
|
||||
ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
|
||||
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
||||
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
|
||||
cameraNear, cameraFar, &minZ, &maxZ);
|
||||
minZArray[tileX] = minZ;
|
||||
maxZArray[tileX] = maxZ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class MinMaxZTree
|
||||
{
|
||||
public:
|
||||
// Currently (min) tile dimensions must divide gBuffer dimensions evenly
|
||||
// Levels must be small enough that neither dimension goes below one tile
|
||||
MinMaxZTree(
|
||||
int tileWidth, int tileHeight, int levels,
|
||||
int gBufferWidth, int gBufferHeight)
|
||||
: mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
|
||||
{
|
||||
mNumTilesX = gBufferWidth / mTileWidth;
|
||||
mNumTilesY = gBufferHeight / mTileHeight;
|
||||
|
||||
// Allocate arrays
|
||||
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
int x = NumTilesX(i);
|
||||
int y = NumTilesY(i);
|
||||
assert(x > 0);
|
||||
assert(y > 0);
|
||||
// NOTE: If the following two asserts fire it probably means that
|
||||
// the base tile dimensions do not evenly divide the G-buffer dimensions
|
||||
assert(x * (mTileWidth << i) >= gBufferWidth);
|
||||
assert(y * (mTileHeight << i) >= gBufferHeight);
|
||||
mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
}
|
||||
}
|
||||
|
||||
void Update(float *zBuffer, int gBufferPitchInElements,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar)
|
||||
{
|
||||
for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
|
||||
ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
|
||||
zBuffer, gBufferPitchInElements,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
mMinZArrays[0] + (tileY * mNumTilesX),
|
||||
mMaxZArrays[0] + (tileY * mNumTilesX));
|
||||
}
|
||||
|
||||
// Generate other levels
|
||||
for (int level = 1; level < mLevels; ++level) {
|
||||
int destTilesX = NumTilesX(level);
|
||||
int destTilesY = NumTilesY(level);
|
||||
int srcLevel = level - 1;
|
||||
int srcTilesX = NumTilesX(srcLevel);
|
||||
int srcTilesY = NumTilesY(srcLevel);
|
||||
for (int y = 0; y < destTilesY; ++y) {
|
||||
for (int x = 0; x < destTilesX; ++x) {
|
||||
int srcX = x << 1;
|
||||
int srcY = y << 1;
|
||||
// NOTE: Ugly branches to deal with non-multiple dimensions at some levels
|
||||
// TODO: SSE branchless min/max is probably better...
|
||||
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
if (srcX + 1 < srcTilesX) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
}
|
||||
}
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
}
|
||||
mMinZArrays[level][y * destTilesX + x] = minZ;
|
||||
mMaxZArrays[level][y * destTilesX + x] = maxZ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~MinMaxZTree() {
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
lAlignedFree(mMinZArrays[i]);
|
||||
lAlignedFree(mMaxZArrays[i]);
|
||||
}
|
||||
lAlignedFree(mMinZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
}
|
||||
|
||||
int Levels() const { return mLevels; }
|
||||
|
||||
// These round UP, so beware that the last tile for a given level may not be completely full
|
||||
// TODO: Verify this...
|
||||
int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
|
||||
int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
|
||||
int TileWidth(int level = 0) const { return (mTileWidth << level); }
|
||||
int TileHeight(int level = 0) const { return (mTileHeight << level); }
|
||||
|
||||
float MinZ(int level, int tileX, int tileY) const {
|
||||
return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
float MaxZ(int level, int tileX, int tileY) const {
|
||||
return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
|
||||
private:
|
||||
int mTileWidth;
|
||||
int mTileHeight;
|
||||
int mLevels;
|
||||
int mNumTilesX;
|
||||
int mNumTilesY;
|
||||
|
||||
// One array for each "level" in the tree
|
||||
float **mMinZArrays;
|
||||
float **mMaxZArrays;
|
||||
};
|
||||
|
||||
static MinMaxZTree *gMinMaxZTree = 0;
|
||||
|
||||
void InitDynamicC(InputData *input) {
|
||||
gMinMaxZTree =
|
||||
new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
|
||||
input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
}
|
||||
|
||||
|
||||
/* Distribute a tile's light list over its four quadrants.

   The tile is cut by one plane through screen column tileMidX and one
   through screen row tileMidY.  Each light named in
   lightIndices[0..numLights) is tested against every quadrant's
   [subtileMinZ, subtileMaxZ] range and against the two cutting planes,
   using the light's attenuation-end distance as its radius.

   Quadrant order is 00, 10, 01, 11 throughout.  The surviving indices for
   quadrant q are written starting at subtileIndices[q * subtileIndicesPitch]
   and their count is stored in subtileNumLights[q]. */
static void
SplitTileMinMax(
    int tileMidX, int tileMidY,
    // Subtile data (00, 10, 01, 11)
    float subtileMinZ[],
    float subtileMaxZ[],
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int lightIndices[],
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Outputs
    int subtileIndices[],
    int subtileIndicesPitch,
    int subtileNumLights[]
    )
{
    const float halfWidth = 0.5f * (float)gBufferWidth;
    const float halfHeight = 0.5f * (float)gBufferHeight;

    // Cutting planes: entry 0 is the X split, entry 1 is the Y split.  Each
    // plane has one x-or-y coefficient and one z coefficient.
    float planeXY[2] = { -(cameraProj_11 * halfWidth),
                         (cameraProj_22 * halfHeight) };
    float planeZ[2] = { tileMidX - halfWidth,
                        tileMidY - halfHeight };

    // Bring both planes to unit length so plane distances are comparable
    // with the light radius.
    for (int p = 0; p < 2; ++p) {
        float invLength = 1.f / sqrtf(planeXY[p] * planeXY[p] +
                                      planeZ[p] * planeZ[p]);
        planeXY[p] *= invLength;
        planeZ[p] *= invLength;
    }

    // One write cursor per quadrant, starting at that quadrant's slice.
    int cursor[4];
    for (int q = 0; q < 4; ++q)
        cursor[q] = q * subtileIndicesPitch;

    for (int n = 0; n < numLights; ++n) {
        const int lightIndex = lightIndices[n];

        const float lightX = light_positionView_x_array[lightIndex];
        const float lightY = light_positionView_y_array[lightIndex];
        const float lightZ = light_positionView_z_array[lightIndex];
        const float radius = light_attenuationEnd_array[lightIndex];
        const float negRadius = -radius;

        // Depth test against each quadrant's own z bounds
        bool keep[4];
        for (int q = 0; q < 4; ++q) {
            keep[q] = (lightZ - subtileMinZ[q] >= negRadius) &&
                      (subtileMaxZ[q] - lightZ >= negRadius);
        }

        const float distX = lightZ * planeZ[0] + lightX * planeXY[0];
        const float distY = lightZ * planeZ[1] + lightY * planeXY[1];

        // A light that does not straddle the X split stays only in the
        // column on its own side: quadrants 00/01 on the positive side,
        // 10/11 on the negative side.
        if (fabsf(distX) > radius) {
            const bool onPositiveSide = distX > 0.0f;
            for (int q = 0; q < 4; ++q) {
                const bool wantsPositive = (q & 1) == 0;
                keep[q] = keep[q] && (onPositiveSide == wantsPositive);
            }
        }

        // Same for the Y split: quadrants 00/10 on the positive side,
        // 01/11 on the negative side.
        if (fabsf(distY) > radius) {
            const bool onPositiveSide = distY > 0.0f;
            for (int q = 0; q < 4; ++q) {
                const bool wantsPositive = (q & 2) == 0;
                keep[q] = keep[q] && (onPositiveSide == wantsPositive);
            }
        }

        for (int q = 0; q < 4; ++q) {
            if (keep[q])
                subtileIndices[cursor[q]++] = lightIndex;
        }
    }

    // Convert the final cursors back into per-quadrant counts
    for (int q = 0; q < 4; ++q)
        subtileNumLights[q] = cursor[q] - q * subtileIndicesPitch;
}
|
||||
|
||||
|
||||
/* Dot product of the 3-vectors (x, y, z) and (a, b, c). */
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    const float xa = x * a;
    const float yb = y * b;
    const float zc = z * c;
    // Same left-to-right summation order as a plain x*a + y*b + z*c
    return (xa + yb) + zc;
}
|
||||
|
||||
|
||||
/* Scale the vector (x, y, z) to unit length and store it in (ox, oy, oz).

   The inputs are passed by value, so the output references may name the
   same variables that supplied the inputs.  A zero-length input is not
   guarded against and divides by zero. */
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    const float lengthSquared = x*x + y*y + z*z;
    const float invLength = 1.f / sqrtf(lengthSquared);
    ox = x * invLength;
    oy = y * invLength;
    oz = z * invLength;
}
|
||||
|
||||
|
||||
/* Map an 8-bit unsigned-normalized value (0..255) to a float by
   multiplying with 1/255. */
static inline float
Unorm8ToFloat32(uint8_t u) {
    const float scale = 1.0f / 255.0f;
    return (float)u * scale;
}
|
||||
|
||||
|
||||
/* Map a float to an 8-bit unsigned-normalized value by scaling with 255
   and truncating toward zero.  No clamping is done here; the caller is
   expected to pass a value already limited to [0, 1]. */
static inline uint8_t
Float32ToUnorm8(float f) {
    const float scaled = f * 255.0f;
    return (uint8_t)scaled;
}
|
||||
|
||||
|
||||
/* Expand a 16-bit half-precision bit pattern into a 32-bit float.

   The three fields are moved to their single-precision positions and the
   exponent is simply re-biased from 15 to 127.  There are no branches, so
   zero, denormal, infinity and NaN encodings get no special treatment. */
static inline float
half_to_float_fast(uint16_t h) {
    // Isolate the fields while they are still at their 16-bit positions
    const uint32_t signField = h & (int32_t)0x8000u;
    const uint32_t exponentField = h & (int32_t)0x7C00u;
    const uint32_t mantissaField = h & (int32_t)0x03FFu;

    // Remove the half bias (15) and apply the single bias (127)
    const int32_t rebiasedExponent = ((int32_t) (exponentField >> 10)) - 15 + 127;

    // Sign moves from bit 15 to bit 31, exponent lands at bit 23, and the
    // 10-bit mantissa is left-aligned in the 23-bit mantissa field.
    uint32_t bits = (((uint32_t) signField) << 16) |
                    ((uint32_t) (rebiasedExponent << 23)) |
                    (((uint32_t) mantissaField) << 13);

    // NOTE(review): reading the integer through a float pointer relies on the
    // compiler tolerating this kind of type punning.
    return *reinterpret_cast<float *>(&bits);
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeTileC(
|
||||
int32_t tileStartX, int32_t tileEndX,
|
||||
int32_t tileStartY, int32_t tileEndY,
|
||||
int32_t gBufferWidth, int32_t gBufferHeight,
|
||||
const ispc::InputDataArrays &inputData,
|
||||
// Camera data
|
||||
float cameraProj_11, float cameraProj_22,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
// Light list
|
||||
int32_t tileLightIndices[],
|
||||
int32_t tileNumLights,
|
||||
// UI
|
||||
bool visualizeLightCount,
|
||||
// Output
|
||||
uint8_t framebuffer_r[],
|
||||
uint8_t framebuffer_g[],
|
||||
uint8_t framebuffer_b[]
|
||||
)
|
||||
{
|
||||
if (tileNumLights == 0 || visualizeLightCount) {
|
||||
uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));
|
||||
for (int32_t y = tileStartY; y < tileEndY; ++y) {
|
||||
for (int32_t x = tileStartX; x < tileEndX; ++x) {
|
||||
int32_t framebufferIndex = (y * gBufferWidth + x);
|
||||
framebuffer_r[framebufferIndex] = c;
|
||||
framebuffer_g[framebufferIndex] = c;
|
||||
framebuffer_b[framebufferIndex] = c;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
float twoOverGBufferWidth = 2.0f / gBufferWidth;
|
||||
float twoOverGBufferHeight = 2.0f / gBufferHeight;
|
||||
|
||||
for (int32_t y = tileStartY; y < tileEndY; ++y) {
|
||||
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||
|
||||
for (int32_t x = tileStartX; x < tileEndX; ++x) {
|
||||
int32_t gBufferOffset = y * gBufferWidth + x;
|
||||
|
||||
// Reconstruct position and (negative) view vector from G-buffer
|
||||
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||
float Vneg_x, Vneg_y, Vneg_z;
|
||||
|
||||
float z = inputData.zBuffer[gBufferOffset];
|
||||
|
||||
// Compute screen/clip-space position
|
||||
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||
float positionScreen_x = (0.5f + (float)(x)) *
|
||||
twoOverGBufferWidth - 1.0f;
|
||||
|
||||
// Unproject depth buffer Z value into view space
|
||||
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
cameraProj_11;
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
cameraProj_22;
|
||||
|
||||
// We actually end up with a vector pointing *at* the
|
||||
// surface (i.e. the negative view vector)
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
|
||||
|
||||
// Reconstruct normal from G-buffer
|
||||
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
|
||||
|
||||
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||
float m = sqrtf(4.0f * f - 1.0f);
|
||||
|
||||
surface_normal_x = m * (4.0f * normal_x - 2.0f);
|
||||
surface_normal_y = m * (4.0f * normal_y - 2.0f);
|
||||
surface_normal_z = 3.0f - 8.0f * f;
|
||||
|
||||
// Load other G-buffer parameters
|
||||
float surface_specularAmount =
|
||||
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
|
||||
float surface_specularPower =
|
||||
half_to_float_fast(inputData.specularPower[gBufferOffset]);
|
||||
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||
|
||||
float lit_x = 0.0f;
|
||||
float lit_y = 0.0f;
|
||||
float lit_z = 0.0f;
|
||||
for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
++tileLightIndex) {
|
||||
int32_t lightIndex = tileLightIndices[tileLightIndex];
|
||||
|
||||
// Gather light data relevant to initial culling
|
||||
float light_positionView_x =
|
||||
inputData.lightPositionView_x[lightIndex];
|
||||
float light_positionView_y =
|
||||
inputData.lightPositionView_y[lightIndex];
|
||||
float light_positionView_z =
|
||||
inputData.lightPositionView_z[lightIndex];
|
||||
float light_attenuationEnd =
|
||||
inputData.lightAttenuationEnd[lightIndex];
|
||||
|
||||
// Compute light vector
|
||||
float L_x = light_positionView_x - surface_positionView_x;
|
||||
float L_y = light_positionView_y - surface_positionView_y;
|
||||
float L_z = light_positionView_z - surface_positionView_z;
|
||||
|
||||
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip at end of attenuation
|
||||
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
|
||||
|
||||
if (distanceToLight2 < light_attenutaionEnd2) {
|
||||
float distanceToLight = sqrtf(distanceToLight2);
|
||||
|
||||
float distanceToLightRcp = 1.f / distanceToLight;
|
||||
L_x *= distanceToLightRcp;
|
||||
L_y *= distanceToLightRcp;
|
||||
L_z *= distanceToLightRcp;
|
||||
|
||||
// Start computing brdf
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip back facing
|
||||
if (NdotL > 0.0f) {
|
||||
float light_attenuationBegin =
|
||||
inputData.lightAttenuationBegin[lightIndex];
|
||||
|
||||
// Light distance attenuation (linstep)
|
||||
float lightRange = (light_attenuationEnd - light_attenuationBegin);
|
||||
float falloffPosition = (light_attenuationEnd - distanceToLight);
|
||||
float attenuation = std::min(falloffPosition / lightRange, 1.0f);
|
||||
|
||||
float H_x = (L_x - Vneg_x);
|
||||
float H_y = (L_y - Vneg_y);
|
||||
float H_z = (L_z - Vneg_z);
|
||||
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, H_x, H_y, H_z);
|
||||
NdotH = std::max(NdotH, 0.0f);
|
||||
|
||||
float specular = powf(NdotH, surface_specularPower);
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
(1.0f / 8.0f);
|
||||
float specularContrib = surface_specularAmount *
|
||||
specularNorm * specular;
|
||||
|
||||
float k = attenuation * NdotL * (1.0f + specularContrib);
|
||||
|
||||
float light_color_x = inputData.lightColor_x[lightIndex];
|
||||
float light_color_y = inputData.lightColor_y[lightIndex];
|
||||
float light_color_z = inputData.lightColor_z[lightIndex];
|
||||
|
||||
float lightContrib_x = surface_albedo_x * light_color_x;
|
||||
float lightContrib_y = surface_albedo_y * light_color_y;
|
||||
float lightContrib_z = surface_albedo_z * light_color_z;
|
||||
|
||||
lit_x += lightContrib_x * k;
|
||||
lit_y += lightContrib_y * k;
|
||||
lit_z += lightContrib_z * k;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Gamma correct
|
||||
float gamma = 1.0 / 2.2f;
|
||||
lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
|
||||
lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
|
||||
lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
|
||||
|
||||
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
|
||||
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
|
||||
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// If we few enough lights or this is the base case (last level), shade
|
||||
// this full tile directly
|
||||
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// Skip entirely offscreen tiles
|
||||
if (endX > startX && endY > startY) {
|
||||
ShadeTileC(startX, endX, startY, endY,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->arrays,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||
// Move down a level in the tree
|
||||
--level;
|
||||
tileX <<= 1;
|
||||
tileY <<= 1;
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
|
||||
// Work out splitting coords
|
||||
int midX = (tileX + 1) * width;
|
||||
int midY = (tileY + 1) * height;
|
||||
|
||||
// Read subtile min/max data
|
||||
// NOTE: We must be sure to handle out-of-bounds access here since
|
||||
// sometimes we'll only have 1 or 2 subtiles for non-pow-2
|
||||
// framebuffer sizes.
|
||||
bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
|
||||
bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
|
||||
|
||||
// NOTE: Order is 00, 10, 01, 11
|
||||
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
input->header.cameraFar, input->header.cameraFar};
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
input->header.cameraNear, input->header.cameraNear};
|
||||
|
||||
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
if (rightTileExists) {
|
||||
minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
|
||||
maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
|
||||
if (bottomTileExists) {
|
||||
minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
|
||||
maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
|
||||
}
|
||||
}
|
||||
if (bottomTileExists) {
|
||||
minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
|
||||
maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
|
||||
}
|
||||
|
||||
// Cull lights into subtile lists
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int subtileLightIndices[4][MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int subtileNumLights[4];
|
||||
SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd,
|
||||
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||
|
||||
// Recurse into subtiles
|
||||
ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
subtileLightIndices[0], subtileNumLights[0],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||
subtileLightIndices[1], subtileNumLights[1],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX , tileY + 1,
|
||||
subtileLightIndices[2], subtileNumLights[2],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
|
||||
subtileLightIndices[3], subtileNumLights[3],
|
||||
framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Build the light list of one screen tile.

   Light indices 0..numLights-1 are tested against the tile's view-space
   depth range [minZ, maxZ] and against four side planes derived from the
   tile's pixel bounds, using each light's attenuation-end distance as its
   radius.  Indices of lights that pass all six tests are written to
   tileLightIndices in increasing order.

   Returns the number of indices written. */
static int
IntersectLightsWithTileMinMax(
    int tileStartX, int tileEndX,
    int tileStartY, int tileEndY,
    // Tile data
    float minZ,
    float maxZ,
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    int tileLightIndices[]
    )
{
    const float halfWidth = 0.5f * (float)gBufferWidth;
    const float halfHeight = 0.5f * (float)gBufferHeight;

    // Side planes, in order: through tileEndX, tileStartX, tileEndY,
    // tileStartY.  Planes 0 and 1 combine z with the light's x; planes 2
    // and 3 combine z with the light's y.
    float planeXY[4] = { -(cameraProj_11 * halfWidth),
                         (cameraProj_11 * halfWidth),
                         (cameraProj_22 * halfHeight),
                         -(cameraProj_22 * halfHeight) };

    float planeZ[4] = { tileEndX - halfWidth,
                        -tileStartX + halfWidth,
                        tileEndY - halfHeight,
                        -tileStartY + halfHeight };

    // Unit-length planes make the plane distance comparable with the radius
    for (int p = 0; p < 4; ++p) {
        float invLength = 1.f / sqrtf(planeXY[p] * planeXY[p] +
                                      planeZ[p] * planeZ[p]);
        planeXY[p] *= invLength;
        planeZ[p] *= invLength;
    }

    int count = 0;

    for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) {
        const float lightZ = light_positionView_z_array[lightIndex];
        const float radius = light_attenuationEnd_array[lightIndex];
        const float negRadius = -radius;

        // Depth range first: x and y are only loaded for lights that
        // survive it.
        if (!(lightZ - minZ >= negRadius))
            continue;
        if (!(maxZ - lightZ >= negRadius))
            continue;

        const float lightX = light_positionView_x_array[lightIndex];
        const float lightY = light_positionView_y_array[lightIndex];

        bool inside = true;
        for (int p = 0; p < 4 && inside; ++p) {
            const float lateral = (p < 2) ? lightX : lightY;
            const float d = lightZ * planeZ[p] +
                            lateral * planeXY[p];
            inside = (d >= negRadius);
        }

        // Pack and store intersecting lights
        if (inside)
            tileLightIndices[count++] = lightIndex;
    }

    return count;
}
|
||||
|
||||
|
||||
void
|
||||
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// Get Z min/max for this tile
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
float minZ = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// This is a root tile, so first do a full 6-plane cull
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int lightIndices[MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int numLights = IntersectLightsWithTileMinMax(
|
||||
startX, endX, startY, endY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd, lightIndices);
|
||||
|
||||
// Now kick off the recursive process for this tile
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
numLights, framebuffer);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
|
||||
{
|
||||
MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// Update min/max Z tree
|
||||
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraNear, input->header.cameraFar);
|
||||
|
||||
int rootLevel = minMaxZTree->Levels() - 1;
|
||||
int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
|
||||
int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
|
||||
int rootTiles = rootTilesX * rootTilesY;
|
||||
for (int g = 0; g < rootTiles; ++g) {
|
||||
uint32_t tileY = g / rootTilesX;
|
||||
uint32_t tileX = g % rootTilesX;
|
||||
ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
|
||||
}
|
||||
}
|
||||
398
examples/deferred/dynamic_cilk.cpp
Normal file
398
examples/deferred/dynamic_cilk.cpp
Normal file
@@ -0,0 +1,398 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef __cilk
|
||||
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif // ISPC_IS_LINUX
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||
#define MIN_TILE_WIDTH 16
|
||||
#define MIN_TILE_HEIGHT 16
|
||||
|
||||
|
||||
#define DYNAMIC_TREE_LEVELS 5
|
||||
// If this is set to 1 then the result will be identical to the static version
|
||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||
|
||||
/* Allocate `size` bytes whose address is a multiple of `alignment`.

   `alignment` must be a power of two.  Returns NULL if the underlying
   allocator fails.  Release the memory with lAlignedFree().

   Windows and Linux use the platform's aligned allocator.  Every other
   platform (including Apple) over-allocates with malloc(), rounds the
   address up, and stores the original malloc() pointer in the pointer-sized
   slot immediately before the returned address so lAlignedFree() can find
   it. */
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
#elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
#else
    const size_t mask = (size_t)alignment - 1;

    // Room for the payload, the back-pointer slot, and up to `mask` bytes
    // of alignment padding.
    void *mem = malloc(size + mask + sizeof(void *));
    if (mem == NULL)
        return NULL;

    // Skip the back-pointer slot, then round up to the next aligned address.
    // The padding is in [0, mask]: an address that is already aligned gets
    // none, so slot + padding + payload never exceeds the allocation.
    char *amem = ((char *)mem) + sizeof(void *);
    amem += ((size_t)alignment - ((size_t)amem & mask)) & mask;

    ((void **)amem)[-1] = mem;
    return amem;
#endif
}
|
||||
|
||||
|
||||
/* Release memory obtained from lAlignedMalloc().

   Windows and Linux hand the pointer back to the platform allocator.  On
   every other platform (including Apple) the pointer-sized slot just before
   `ptr` holds the original malloc() pointer, and that is what gets freed.
   A NULL `ptr` is ignored on that path, matching free(NULL). */
static void
lAlignedFree(void *ptr) {
#if defined(ISPC_IS_WINDOWS)
    _aligned_free(ptr);
#elif defined(ISPC_IS_LINUX)
    free(ptr);
#else
    // There is no back-pointer slot in front of a NULL pointer
    if (ptr == NULL)
        return;
    free(((void **)ptr)[-1]);
#endif
}
|
||||
|
||||
|
||||
class MinMaxZTreeCilk
|
||||
{
|
||||
public:
|
||||
// Currently (min) tile dimensions must divide gBuffer dimensions evenly
|
||||
// Levels must be small enough that neither dimension goes below one tile
|
||||
MinMaxZTreeCilk(
|
||||
int tileWidth, int tileHeight, int levels,
|
||||
int gBufferWidth, int gBufferHeight)
|
||||
: mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
|
||||
{
|
||||
mNumTilesX = gBufferWidth / mTileWidth;
|
||||
mNumTilesY = gBufferHeight / mTileHeight;
|
||||
|
||||
// Allocate arrays
|
||||
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
int x = NumTilesX(i);
|
||||
int y = NumTilesY(i);
|
||||
assert(x > 0);
|
||||
assert(y > 0);
|
||||
// NOTE: If the following two asserts fire it probably means that
|
||||
// the base tile dimensions do not evenly divide the G-buffer dimensions
|
||||
assert(x * (mTileWidth << i) >= gBufferWidth);
|
||||
assert(y * (mTileHeight << i) >= gBufferHeight);
|
||||
mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
}
|
||||
}
|
||||
|
||||
void Update(float *zBuffer, int gBufferPitchInElements,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar)
|
||||
{
|
||||
// Compute level 0 in parallel. Outer loops is here since we use Cilk
|
||||
_Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
|
||||
ispc::ComputeZBoundsRow(tileY,
|
||||
mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
|
||||
zBuffer, gBufferPitchInElements,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
mMinZArrays[0] + (tileY * mNumTilesX),
|
||||
mMaxZArrays[0] + (tileY * mNumTilesX));
|
||||
}
|
||||
|
||||
// Generate other levels
|
||||
// NOTE: We currently don't use ispc here since it's sort of an
|
||||
// awkward gather-based reduction Using SSE odd pack/unpack
|
||||
// instructions might actually work here when we need to optimize
|
||||
for (int level = 1; level < mLevels; ++level) {
|
||||
int destTilesX = NumTilesX(level);
|
||||
int destTilesY = NumTilesY(level);
|
||||
int srcLevel = level - 1;
|
||||
int srcTilesX = NumTilesX(srcLevel);
|
||||
int srcTilesY = NumTilesY(srcLevel);
|
||||
_Cilk_for (int y = 0; y < destTilesY; ++y) {
|
||||
for (int x = 0; x < destTilesX; ++x) {
|
||||
int srcX = x << 1;
|
||||
int srcY = y << 1;
|
||||
// NOTE: Ugly branches to deal with non-multiple dimensions at some levels
|
||||
// TODO: SSE branchless min/max is probably better...
|
||||
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
if (srcX + 1 < srcTilesX) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
}
|
||||
}
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
}
|
||||
mMinZArrays[level][y * destTilesX + x] = minZ;
|
||||
mMaxZArrays[level][y * destTilesX + x] = maxZ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~MinMaxZTreeCilk() {
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
lAlignedFree(mMinZArrays[i]);
|
||||
lAlignedFree(mMaxZArrays[i]);
|
||||
}
|
||||
lAlignedFree(mMinZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
}
|
||||
|
||||
int Levels() const { return mLevels; }
|
||||
|
||||
// These round UP, so beware that the last tile for a given level may not be completely full
|
||||
// TODO: Verify this...
|
||||
int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
|
||||
int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
|
||||
int TileWidth(int level = 0) const { return (mTileWidth << level); }
|
||||
int TileHeight(int level = 0) const { return (mTileHeight << level); }
|
||||
|
||||
float MinZ(int level, int tileX, int tileY) const {
|
||||
return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
float MaxZ(int level, int tileX, int tileY) const {
|
||||
return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
|
||||
private:
|
||||
int mTileWidth;
|
||||
int mTileHeight;
|
||||
int mLevels;
|
||||
int mNumTilesX;
|
||||
int mNumTilesY;
|
||||
|
||||
// One array for each "level" in the tree
|
||||
float **mMinZArrays;
|
||||
float **mMaxZArrays;
|
||||
};
|
||||
|
||||
static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;
|
||||
|
||||
/* Create the global min/max Z tree used by the Cilk renderer, sized for the
   framebuffer described by input->header.

   Calling this again replaces the tree.  The new tree is built first and
   the previous one is deleted only afterwards, so a repeated call does not
   leak and a failed construction leaves the old tree in place. */
void InitDynamicCilk(InputData *input) {
    MinMaxZTreeCilk *fresh =
        new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
                            input->header.framebufferWidth,
                            input->header.framebufferHeight);

    // Deleting a null pointer is a no-op, so the first call needs no check
    delete gMinMaxZTreeCilk;
    gMinMaxZTreeCilk = fresh;
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// If we few enough lights or this is the base case (last level), shade
|
||||
// this full tile directly
|
||||
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// Skip entirely offscreen tiles
|
||||
if (endX > startX && endY > startY) {
|
||||
ispc::ShadeTile(
|
||||
startX, endX, startY, endY,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
&input->arrays,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||
// Move down a level in the tree
|
||||
--level;
|
||||
tileX <<= 1;
|
||||
tileY <<= 1;
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
|
||||
// Work out splitting coords
|
||||
int midX = (tileX + 1) * width;
|
||||
int midY = (tileY + 1) * height;
|
||||
|
||||
// Read subtile min/max data
|
||||
// NOTE: We must be sure to handle out-of-bounds access here since
|
||||
// sometimes we'll only have 1 or 2 subtiles for non-pow-2
|
||||
// framebuffer sizes.
|
||||
bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
|
||||
bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
|
||||
|
||||
// NOTE: Order is 00, 10, 01, 11
|
||||
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
input->header.cameraFar, input->header.cameraFar};
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
input->header.cameraNear, input->header.cameraNear};
|
||||
|
||||
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
if (rightTileExists) {
|
||||
minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
|
||||
maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
|
||||
if (bottomTileExists) {
|
||||
minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
|
||||
maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
|
||||
}
|
||||
}
|
||||
if (bottomTileExists) {
|
||||
minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
|
||||
maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
|
||||
}
|
||||
|
||||
// Cull lights into subtile lists
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int subtileLightIndices[4][MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int subtileNumLights[4];
|
||||
ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd,
|
||||
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||
|
||||
// Recurse into subtiles
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
subtileLightIndices[0], subtileNumLights[0],
|
||||
framebuffer);
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||
subtileLightIndices[1], subtileNumLights[1],
|
||||
framebuffer);
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY + 1,
|
||||
subtileLightIndices[2], subtileNumLights[2],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
|
||||
subtileLightIndices[3], subtileNumLights[3],
|
||||
framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// Get Z min/max for this tile
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
float minZ = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// This is a root tile, so first do a full 6-plane cull
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int lightIndices[MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int numLights = ispc::IntersectLightsWithTileMinMax(
|
||||
startX, endX, startY, endY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd, lightIndices);
|
||||
|
||||
// Now kick off the recursive process for this tile
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
numLights, framebuffer);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
|
||||
{
|
||||
MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// Update min/max Z tree
|
||||
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraNear, input->header.cameraFar);
|
||||
|
||||
// Launch the "root" tiles. Ideally these should at least fill the
|
||||
// machine... at the moment we have a static number of "levels" to the
|
||||
// mip tree but it might make sense to compute it based on the width of
|
||||
// the machine.
|
||||
int rootLevel = minMaxZTree->Levels() - 1;
|
||||
int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
|
||||
int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
|
||||
int rootTiles = rootTilesX * rootTilesY;
|
||||
_Cilk_for (int g = 0; g < rootTiles; ++g) {
|
||||
uint32_t tileY = g / rootTilesX;
|
||||
uint32_t tileX = g % rootTilesX;
|
||||
ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // __cilk
|
||||
672
examples/deferred/kernels.ispc
Normal file
672
examples/deferred/kernels.ispc
Normal file
@@ -0,0 +1,672 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "deferred.h"
|
||||
|
||||
// Pointers to the G-buffer planes and the per-light attribute arrays that
// the kernels in this file read.
// NOTE(review): presumably the member order must match the application-side
// declaration of this struct -- confirm before reordering anything.
struct InputDataArrays
{
    // Depth buffer
    float *zBuffer;

    // Encoded surface normal (two components, stored as half floats)
    unsigned int16 *normalEncoded_x;
    unsigned int16 *normalEncoded_y;

    // Specular parameters (stored as half floats)
    unsigned int16 *specularAmount;
    unsigned int16 *specularPower;

    // Surface albedo (stored as unorm8)
    unsigned int8 *albedo_x;
    unsigned int8 *albedo_y;
    unsigned int8 *albedo_z;

    // Light data, indexed by light index
    float *lightPositionView_x;
    float *lightPositionView_y;
    float *lightPositionView_z;
    float *lightAttenuationBegin;
    float *lightColor_x;
    float *lightColor_y;
    float *lightColor_z;
    float *lightAttenuationEnd;
};
|
||||
|
||||
// Header describing one input data set: camera, framebuffer size, light
// count, and the chunk size / array offsets of the input data.
// NOTE(review): presumably the member order must match the application-side
// declaration of this struct -- confirm before reordering anything.
struct InputHeader
{
    // Camera
    float cameraProj[4][4];
    float cameraNear;
    float cameraFar;

    // Framebuffer dimensions and light count
    int32 framebufferWidth;
    int32 framebufferHeight;
    int32 numLights;

    // Input data chunk size and one offset per array (idaNum entries)
    int32 inputDataChunkSize;
    int32 inputDataArrayOffsets[idaNum];
};
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Common utility routines
|
||||
|
||||
// Returns the dot product of the 3-vectors (x, y, z) and (a, b, c).
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    float sum = x * a + y * b;
    sum = sum + z * c;
    return sum;
}
|
||||
|
||||
|
||||
// Scales the vector (x, y, z) by the reciprocal square root of its squared
// length (computed with rsqrt) and writes the result to (ox, oy, oz).
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    float invLength = rsqrt(x*x + y*y + z*z);
    ox = x * invLength;
    oy = y * invLength;
    oz = z * invLength;
}
|
||||
|
||||
|
||||
// Converts an 8-bit unsigned normalized value to float by scaling it with
// 1/255, so 0 maps to 0.0 and 255 maps to (approximately) 1.0.
static inline float
Unorm8ToFloat32(unsigned int8 u) {
    float value = (float)u;
    return value * (1.0f / 255.0f);
}
|
||||
|
||||
|
||||
// Converts a float to an 8-bit unsigned normalized value: the input is
// scaled by 255 and then converted with a plain cast (no explicit rounding
// or clamping is done here).
static inline unsigned int8
Float32ToUnorm8(float f) {
    float scaled = f * 255.0f;
    return (unsigned int8)scaled;
}
|
||||
|
||||
|
||||
// Computes the view-space Z range of the pixels in the rectangle
// [tileStartX, tileEndX) x [tileStartY, tileEndY) of the depth buffer.
//
// Each depth value is unprojected with cameraProj_43 / (z - cameraProj_33);
// only results in [cameraNear, cameraFar) contribute.  If no pixel
// contributes, minZ is left at cameraFar and maxZ at cameraNear.
static void
ComputeZBounds(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    uniform float &minZ,
    uniform float &maxZ
    )
{
    // Per-lane running bounds, seeded with an inverted (empty) range
    float nearestZ = cameraFar;
    float farthestZ = cameraNear;

    for (uniform int32 row = tileStartY; row < tileEndY; ++row) {
        foreach (col = tileStartX ... tileEndX) {
            // Unproject depth buffer Z value into view space
            float depth = zBuffer[row * gBufferWidth + col];
            float viewZ = cameraProj_43 / (depth - cameraProj_33);

            // Skip skybox/background or otherwise invalid pixels
            bool usable = (viewZ < cameraFar) && (viewZ >= cameraNear);
            if (usable) {
                nearestZ = min(nearestZ, viewZ);
                farthestZ = max(farthestZ, viewZ);
            }
        }
    }

    // Combine the per-lane bounds into the uniform results
    minZ = reduce_min(nearestZ);
    maxZ = reduce_max(farthestZ);
}
|
||||
|
||||
|
||||
// Builds the list of lights that may affect a screen tile.
//
// Lights 0 .. numLights-1 are tested against the tile's [minZ, maxZ] depth
// range and against four side planes derived from the tile rectangle.  A
// light passes a test when its signed distance is >= -attenuationEnd.  The
// indices of the lights that pass every test are packed into
// tileLightIndices; the number of indices written is returned.
export uniform int32
IntersectLightsWithTileMinMax(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // Tile data
    uniform float minZ,
    uniform float maxZ,
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    uniform int32 tileLightIndices[]
    )
{
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;

    // Side planes of the tile.  Planes 0 and 1 are tested against the
    // light's (x, z), planes 2 and 3 against its (y, z).
    uniform float planeXY[4] = {
        -(cameraProj_11 * gBufferScale_x),
        (cameraProj_11 * gBufferScale_x),
        (cameraProj_22 * gBufferScale_y),
        -(cameraProj_22 * gBufferScale_y) };
    uniform float planeZ[4] = {
        tileEndX - gBufferScale_x,
        -tileStartX + gBufferScale_x,
        tileEndY - gBufferScale_y,
        -tileStartY + gBufferScale_y };

    // Normalize the plane coefficients
    for (uniform int p = 0; p < 4; ++p) {
        uniform float invLen = rsqrt(planeXY[p] * planeXY[p] +
                                     planeZ[p] * planeZ[p]);
        planeXY[p] *= invLen;
        planeZ[p] *= invLen;
    }

    uniform int32 tileNumLights = 0;

    foreach (lightIndex = 0 ... numLights) {
        float lightZ = light_positionView_z_array[lightIndex];
        float negRadius = -light_attenuationEnd_array[lightIndex];

        // Depth range test
        bool inFrustum = (lightZ - minZ >= negRadius);
        inFrustum = inFrustum && (maxZ - lightZ >= negRadius);

        // Greedy early-out: if no lane survived the depth test there is
        // nothing left to do.  The rest of the body doesn't need to be
        // masked, so any() is used here rather than a per-lane continue.
        if (any(inFrustum)) {
            float lightX = light_positionView_x_array[lightIndex];
            float lightY = light_positionView_y_array[lightIndex];

            float dist0 = lightZ * planeZ[0] + lightX * planeXY[0];
            inFrustum = inFrustum && (dist0 >= negRadius);

            float dist1 = lightZ * planeZ[1] + lightX * planeXY[1];
            inFrustum = inFrustum && (dist1 >= negRadius);

            float dist2 = lightZ * planeZ[2] + lightY * planeXY[2];
            inFrustum = inFrustum && (dist2 >= negRadius);

            float dist3 = lightZ * planeZ[3] + lightY * planeXY[3];
            inFrustum = inFrustum && (dist3 >= negRadius);

            // Pack and store intersecting lights
            cif (inFrustum) {
                tileNumLights += packed_store_active(
                    &tileLightIndices[tileNumLights], lightIndex);
            }
        }
    }

    return tileNumLights;
}
|
||||
|
||||
|
||||
// Builds the light list for a screen tile directly from the depth buffer:
// first computes the tile's view-space Z range with ComputeZBounds(), then
// culls lights 0 .. numLights-1 against the tile with
// IntersectLightsWithTileMinMax().
//
// tileLightIndices receives the indices of the lights that pass the test
// and must have room for numLights entries.  Returns the number of indices
// written.
static uniform int32
IntersectLightsWithTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // G-buffer data
    uniform float zBuffer[],
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    uniform int32 tileLightIndices[]
    )
{
    uniform float minZ, maxZ;
    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
                   zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
                   cameraNear, cameraFar, minZ, maxZ);

    // Forward the caller's light count.  This used to pass MAX_LIGHTS
    // unconditionally, which silently ignored the numLights parameter.
    uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
        numLights, light_positionView_x_array, light_positionView_y_array,
        light_positionView_z_array, light_attenuationEnd_array,
        tileLightIndices);

    return tileNumLights;
}
|
||||
|
||||
|
||||
// Shades the pixels in [tileStartX, tileEndX) x [tileStartY, tileEndY) and
// writes 8-bit results to framebuffer_r/g/b.
//
// If the tile has no lights, or visualizeLightCount is set, every pixel is
// filled with the gray level min(tileNumLights * 4, 255).  Otherwise each
// pixel's view-space position, normal and material are reconstructed from
// the G-buffer, the contributions of the lights listed in tileLightIndices
// are accumulated, and the result is clamped and gamma corrected.
export void
ShadeTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    uniform InputDataArrays &inputData,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    // Light list
    uniform int32 tileLightIndices[],
    uniform int32 tileNumLights,
    // UI
    uniform bool visualizeLightCount,
    // Output
    uniform unsigned int8 framebuffer_r[],
    uniform unsigned int8 framebuffer_g[],
    uniform unsigned int8 framebuffer_b[]
    )
{
    // Flat fill: either nothing lights this tile, or the light count is
    // being visualized.
    if (tileNumLights == 0 || visualizeLightCount) {
        uniform unsigned int8 gray =
            (unsigned int8)(min(tileNumLights << 2, 255));
        for (uniform int32 row = tileStartY; row < tileEndY; ++row) {
            foreach (col = tileStartX ... tileEndX) {
                int32 pixel = (row * gBufferWidth + col);
                framebuffer_r[pixel] = gray;
                framebuffer_g[pixel] = gray;
                framebuffer_b[pixel] = gray;
            }
        }
        return;
    }

    uniform float twoOverWidth = 2.0f / gBufferWidth;
    uniform float twoOverHeight = 2.0f / gBufferHeight;

    for (uniform int32 row = tileStartY; row < tileEndY; ++row) {
        uniform float screenY = -(((0.5f + row) * twoOverHeight) - 1.f);

        foreach (col = tileStartX ... tileEndX) {
            int32 pixel = row * gBufferWidth + col;

            float depth = inputData.zBuffer[pixel];

            // Compute screen/clip-space position
            // NOTE: Mind DX11 viewport transform and pixel center!
            float screenX = (0.5f + (float)(col)) * twoOverWidth - 1.0f;

            // Unproject depth buffer Z value into view space
            float posZ = cameraProj_43 / (depth - cameraProj_33);
            float posX = screenX * posZ / cameraProj_11;
            float posY = screenY * posZ / cameraProj_22;

            // Normalizing the position gives a vector pointing *at* the
            // surface, i.e. the negative view vector.
            float Vneg_x, Vneg_y, Vneg_z;
            normalize3(posX, posY, posZ, Vneg_x, Vneg_y, Vneg_z);

            // Reconstruct the normal from its two encoded components
            float encX = half_to_float(inputData.normalEncoded_x[pixel]);
            float encY = half_to_float(inputData.normalEncoded_y[pixel]);
            float f = (encX - encX * encX) + (encY - encY * encY);
            float m = sqrt(4.0f * f - 1.0f);
            float N_x = m * (4.0f * encX - 2.0f);
            float N_y = m * (4.0f * encY - 2.0f);
            float N_z = 3.0f - 8.0f * f;

            // Load other G-buffer parameters
            float specAmount = half_to_float(inputData.specularAmount[pixel]);
            float specPower = half_to_float(inputData.specularPower[pixel]);
            float albedo_x = Unorm8ToFloat32(inputData.albedo_x[pixel]);
            float albedo_y = Unorm8ToFloat32(inputData.albedo_y[pixel]);
            float albedo_z = Unorm8ToFloat32(inputData.albedo_z[pixel]);

            float lit_x = 0.0f;
            float lit_y = 0.0f;
            float lit_z = 0.0f;

            for (uniform int32 k = 0; k < tileNumLights; ++k) {
                uniform int32 lightIndex = tileLightIndices[k];

                // Light data relevant to initial culling
                uniform float lightPos_x = inputData.lightPositionView_x[lightIndex];
                uniform float lightPos_y = inputData.lightPositionView_y[lightIndex];
                uniform float lightPos_z = inputData.lightPositionView_z[lightIndex];
                uniform float attenuationEnd = inputData.lightAttenuationEnd[lightIndex];

                // Light vector (surface -> light)
                float L_x = lightPos_x - posX;
                float L_y = lightPos_y - posY;
                float L_z = lightPos_z - posZ;

                float distSquared = dot3(L_x, L_y, L_z, L_x, L_y, L_z);

                // Clip at end of attenuation
                float attenuationEndSquared = attenuationEnd * attenuationEnd;

                cif (distSquared < attenuationEndSquared) {
                    float dist = sqrt(distSquared);

                    // HLSL "rcp" is allowed to be fairly inaccurate
                    float invDist = rcp(dist);
                    L_x *= invDist;
                    L_y *= invDist;
                    L_z *= invDist;

                    // Start computing brdf
                    float NdotL = dot3(N_x, N_y, N_z, L_x, L_y, L_z);

                    // Clip back facing
                    cif (NdotL > 0.0f) {
                        uniform float attenuationBegin =
                            inputData.lightAttenuationBegin[lightIndex];

                        // Light distance attenuation (linstep)
                        float lightRange = (attenuationEnd - attenuationBegin);
                        float falloffPosition = (attenuationEnd - dist);
                        float attenuation = min(falloffPosition / lightRange, 1.0f);

                        // Half vector between the light and view directions
                        float H_x = (L_x - Vneg_x);
                        float H_y = (L_y - Vneg_y);
                        float H_z = (L_z - Vneg_z);
                        normalize3(H_x, H_y, H_z, H_x, H_y, H_z);

                        float NdotH = dot3(N_x, N_y, N_z, H_x, H_y, H_z);
                        NdotH = max(NdotH, 0.0f);

                        float specular = pow(NdotH, specPower);
                        float specularNorm = (specPower + 2.0f) * (1.0f / 8.0f);
                        float specularContrib = specAmount * specularNorm * specular;

                        float k_light = attenuation * NdotL * (1.0f + specularContrib);

                        uniform float color_x = inputData.lightColor_x[lightIndex];
                        uniform float color_y = inputData.lightColor_y[lightIndex];
                        uniform float color_z = inputData.lightColor_z[lightIndex];

                        float contrib_x = albedo_x * color_x;
                        float contrib_y = albedo_y * color_y;
                        float contrib_z = albedo_z * color_z;

                        lit_x += contrib_x * k_light;
                        lit_y += contrib_y * k_light;
                        lit_z += contrib_z * k_light;
                    }
                }
            }

            // Gamma correct.  These pows are pretty slow right now, but
            // something faster can be done if it is really necessary to
            // squeeze out every last bit of performance.
            float gamma = 1.0 / 2.2f;
            lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
            lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
            lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);

            framebuffer_r[pixel] = Float32ToUnorm8(lit_x);
            framebuffer_g[pixel] = Float32ToUnorm8(lit_y);
            framebuffer_b[pixel] = Float32ToUnorm8(lit_z);
        }
    }
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Static decomposition
|
||||
|
||||
// Task that renders one MIN_TILE_WIDTH x MIN_TILE_HEIGHT screen tile.
//
// The tile is selected by taskIndex: tiles are numbered row by row with
// num_groups_x tiles per row.  The task first culls the lights against the
// tile (IntersectLightsWithTile) and then shades it (ShadeTile).
//
//   num_groups_x, num_groups_y  number of tiles across / down the screen
//   inputHeader                 camera and framebuffer description
//   inputData                   G-buffer and light arrays
//   visualizeLightCount         non-zero to show the per-tile light count
//   framebuffer_r/g/b           output color planes
task void
RenderTile(uniform int num_groups_x, uniform int num_groups_y,
           uniform InputHeader &inputHeader,
           uniform InputDataArrays &inputData,
           uniform int visualizeLightCount,
           // Output
           uniform unsigned int8 framebuffer_r[],
           uniform unsigned int8 framebuffer_g[],
           uniform unsigned int8 framebuffer_b[]) {
    uniform int32 group_y = taskIndex / num_groups_x;
    uniform int32 group_x = taskIndex % num_groups_x;

    uniform int framebufferWidth = inputHeader.framebufferWidth;
    uniform int framebufferHeight = inputHeader.framebufferHeight;

    // The launcher rounds the tile count up, so the last tile in a row or
    // column may extend past the framebuffer when its size is not a multiple
    // of the tile size.  Clamp the tile extents so that the kernels never
    // read or write outside the buffers.
    uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
    uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
    uniform int32 tile_end_x = min(tile_start_x + MIN_TILE_WIDTH,
                                   framebufferWidth);
    uniform int32 tile_end_y = min(tile_start_y + MIN_TILE_HEIGHT,
                                   framebufferHeight);

    // These locals use 0-based matrix indices; the callees name the same
    // values with 1-based indices (cameraProj_11, _22, _33, _43).
    uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
    uniform float cameraProj_11 = inputHeader.cameraProj[1][1];
    uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
    uniform float cameraProj_32 = inputHeader.cameraProj[3][2];

    // Light intersection: figure out which lights illuminate this tile.
    // NOTE(review): MAX_LIGHTS is passed as the light count rather than
    // inputHeader.numLights -- confirm that the light arrays always hold
    // MAX_LIGHTS valid entries.
    uniform int tileLightIndices[MAX_LIGHTS];  // Light list for the tile
    uniform int numTileLights =
        IntersectLightsWithTile(tile_start_x, tile_end_x,
                                tile_start_y, tile_end_y,
                                framebufferWidth, framebufferHeight,
                                inputData.zBuffer,
                                cameraProj_00, cameraProj_11,
                                cameraProj_22, cameraProj_32,
                                inputHeader.cameraNear, inputHeader.cameraFar,
                                MAX_LIGHTS,
                                inputData.lightPositionView_x,
                                inputData.lightPositionView_y,
                                inputData.lightPositionView_z,
                                inputData.lightAttenuationEnd,
                                tileLightIndices);

    // And now shade the tile, using the lights in tileLightIndices
    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
              framebufferWidth, framebufferHeight, inputData,
              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
              tileLightIndices, numTileLights, visualizeLightCount,
              framebuffer_r, framebuffer_g, framebuffer_b);
}
|
||||
|
||||
|
||||
// Entry point for the static decomposition: splits the framebuffer into
// MIN_TILE_WIDTH x MIN_TILE_HEIGHT tiles (tile counts rounded up) and
// launches one RenderTile task per tile.
export void
RenderStatic(uniform InputHeader &inputHeader,
             uniform InputDataArrays &inputData,
             uniform int visualizeLightCount,
             // Output
             uniform unsigned int8 framebuffer_r[],
             uniform unsigned int8 framebuffer_g[],
             uniform unsigned int8 framebuffer_b[]) {
    // Ceiling division: a partially covered tile still needs a task
    uniform int tilesAcross =
        (inputHeader.framebufferWidth + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
    uniform int tilesDown =
        (inputHeader.framebufferHeight + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
    uniform int tileCount = tilesAcross * tilesDown;

    // One task per tile
    launch[tileCount] RenderTile(tilesAcross, tilesDown,
                                 inputHeader, inputData, visualizeLightCount,
                                 framebuffer_r, framebuffer_g, framebuffer_b);
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Routines for dynamic decomposition path
|
||||
|
||||
// This computes the z min/max range for a whole row worth of tiles.
|
||||
// Computes the view-space Z min/max for every tile in one row of tiles.
//
// For tileX in [0, numTilesX), the Z range of the tile at (tileX, tileY) is
// computed with ComputeZBounds() and stored in minZArray[tileX] and
// maxZArray[tileX].  A tile that contains no valid pixels gets
// min = cameraFar and max = cameraNear.
//
// numTilesY is accepted for interface compatibility but is not used.
export void
ComputeZBoundsRow(
    uniform int32 tileY,
    uniform int32 tileWidth, uniform int32 tileHeight,
    uniform int32 numTilesX, uniform int32 numTilesY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    uniform float minZArray[],
    uniform float maxZArray[]
    )
{
    uniform int32 startY = tileY * tileHeight;
    // NOTE(review): the buffer height is not available here, so the Y extent
    // cannot be clamped; the caller must make sure that the rows of the last
    // tile row exist in zBuffer.
    uniform int32 endY = startY + tileHeight;

    for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
        uniform int32 startX = tileX * tileWidth;
        // Clamp to the row width: when gBufferWidth is not a multiple of
        // tileWidth, the last tile would otherwise run into the pixels of
        // the following row (or past the end of the buffer).
        uniform int32 endX = min(startX + tileWidth, gBufferWidth);

        uniform float minZ, maxZ;
        ComputeZBounds(startX, endX, startY, endY,
                       zBuffer, gBufferWidth,
                       cameraProj_33, cameraProj_43, cameraNear, cameraFar,
                       minZ, maxZ);
        minZArray[tileX] = minZ;
        maxZArray[tileX] = maxZ;
    }
}
|
||||
|
||||
|
||||
// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
|
||||
// numLights need not be a multiple of programCount here, but the input and output arrays
|
||||
// should be able to handle programCount-sized load/stores.
|
||||
// Reclassifies a tile's lights with respect to its four subtiles.
//
// Subtile order is 00, 10, 01, 11.  Each of the numLights lights listed in
// lightIndices is tested against every subtile's [min, max] Z range and
// against the X and Y splitting planes at tileMidX / tileMidY.  A light
// farther than its attenuation end from a splitting plane is kept only for
// the subtiles on its side of that plane.
//
// The list for subtile k is written to subtileIndices starting at
// k * subtileIndicesPitch, and its length is written to subtileNumLights[k].
export void
SplitTileMinMax(
    uniform int32 tileMidX, uniform int32 tileMidY,
    // Subtile data (00, 10, 01, 11)
    uniform float subtileMinZ[],
    uniform float subtileMaxZ[],
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    uniform int32 lightIndices[],
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Outputs
    uniform int32 subtileIndices[],
    uniform int32 subtileIndicesPitch,
    uniform int32 subtileNumLights[]
    )
{
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;

    // Splitting planes: index 0 is the X split, index 1 the Y split
    uniform float planeXY[2] = { -(cameraProj_11 * gBufferScale_x),
                                 (cameraProj_22 * gBufferScale_y) };
    uniform float planeZ[2] = { tileMidX - gBufferScale_x,
                                tileMidY - gBufferScale_y };

    // Normalize
    for (uniform int p = 0; p < 2; ++p) {
        uniform float invLen = rsqrt(planeXY[p] * planeXY[p] +
                                     planeZ[p] * planeZ[p]);
        planeXY[p] *= invLen;
        planeZ[p] *= invLen;
    }

    // Write cursor for each subtile's region of subtileIndices
    uniform int32 writeOffset[4];
    for (uniform int k = 0; k < 4; ++k)
        writeOffset[k] = k * subtileIndicesPitch;

    foreach (i = 0 ... numLights) {
        int32 lightIndex = lightIndices[i];

        float lightX = light_positionView_x_array[lightIndex];
        float lightY = light_positionView_y_array[lightIndex];
        float lightZ = light_positionView_z_array[lightIndex];
        float radius = light_attenuationEnd_array[lightIndex];
        float negRadius = -radius;

        // Test the light against each subtile's Z bounds
        bool inSubtile[4];
        for (uniform int k = 0; k < 4; ++k) {
            inSubtile[k] = (lightZ - subtileMinZ[k] >= negRadius) &&
                           (subtileMaxZ[k] - lightZ >= negRadius);
        }

        // Signed distances to the X and Y splitting planes
        float dx = lightZ * planeZ[0] + lightX * planeXY[0];
        float dy = lightZ * planeZ[1] + lightY * planeXY[1];

        // A light that doesn't reach the X splitting plane is kept only for
        // the subtiles on its own side of it.
        cif (abs(dx) > radius) {
            bool positiveX = dx > 0.0f;
            inSubtile[0] = inSubtile[0] && positiveX;   // 00 subtile
            inSubtile[1] = inSubtile[1] && !positiveX;  // 10 subtile
            inSubtile[2] = inSubtile[2] && positiveX;   // 01 subtile
            inSubtile[3] = inSubtile[3] && !positiveX;  // 11 subtile
        }
        // Same for the Y splitting plane
        cif (abs(dy) > radius) {
            bool positiveY = dy > 0.0f;
            inSubtile[0] = inSubtile[0] && positiveY;   // 00 subtile
            inSubtile[1] = inSubtile[1] && positiveY;   // 10 subtile
            inSubtile[2] = inSubtile[2] && !positiveY;  // 01 subtile
            inSubtile[3] = inSubtile[3] && !positiveY;  // 11 subtile
        }

        // Pack and store intersecting lights, subtile by subtile
        for (uniform int k = 0; k < 4; ++k) {
            cif (inSubtile[k])
                writeOffset[k] +=
                    packed_store_active(&subtileIndices[writeOffset[k]],
                                        lightIndex);
        }
    }

    // Convert the final cursors back into per-subtile counts
    for (uniform int k = 0; k < 4; ++k)
        subtileNumLights[k] = writeOffset[k] - k * subtileIndicesPitch;
}
|
||||
139
examples/deferred/main.cpp
Normal file
139
examples/deferred/main.cpp
Normal file
@@ -0,0 +1,139 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#define NOMINMAX
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include "../timing.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (argc != 2) {
|
||||
printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
InputData *input = CreateInputDataFromFile(argv[1]);
|
||||
if (!input) {
|
||||
printf("Failed to load input file \"%s\"!\n", argv[1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
Framebuffer framebuffer(input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
|
||||
InitDynamicC(input);
|
||||
#ifdef __cilk
|
||||
InitDynamicCilk(input);
|
||||
#endif // __cilk
|
||||
|
||||
int nframes = 5;
|
||||
double ispcCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
ispc::RenderStatic(input->header, input->arrays,
|
||||
VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer.r, framebuffer.g, framebuffer.b);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
ispcCycles = std::min(ispcCycles, mcycles);
|
||||
}
|
||||
printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
|
||||
"%d x %d image\n", ispcCycles,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight);
|
||||
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
|
||||
|
||||
#ifdef __cilk
|
||||
double dynamicCilkCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
DispatchDynamicCilk(input, &framebuffer);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
|
||||
}
|
||||
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n",
|
||||
dynamicCilkCycles);
|
||||
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
|
||||
#endif // __cilk
|
||||
|
||||
double serialCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
DispatchDynamicC(input, &framebuffer);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
serialCycles = std::min(serialCycles, mcycles);
|
||||
}
|
||||
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n",
|
||||
serialCycles);
|
||||
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
|
||||
|
||||
#ifdef __cilk
|
||||
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
|
||||
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
|
||||
#else
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
|
||||
#endif // __cilk
|
||||
|
||||
DeleteInputData(input);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -18,8 +18,13 @@ EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "volume", "volume_rendering\volume.vcxproj", "{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.vcxproj", "{2EF070A1-F62F-4E6A-944B-88D140945C3C}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "perfbench", "perfbench\perfbench.vcxproj", "{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Win32 = Debug|Win32
|
||||
@@ -108,6 +113,22 @@ Global
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.Build.0 = Release|Win32
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.ActiveCfg = Release|x64
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.Build.0 = Release|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.Build.0 = Debug|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.Build.0 = Debug|x64
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.Build.0 = Release|Win32
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.ActiveCfg = Release|x64
|
||||
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
||||
1685
examples/intrinsics/generic-16.h
Normal file
1685
examples/intrinsics/generic-16.h
Normal file
File diff suppressed because it is too large
Load Diff
1752
examples/intrinsics/generic-32.h
Normal file
1752
examples/intrinsics/generic-32.h
Normal file
File diff suppressed because it is too large
Load Diff
1881
examples/intrinsics/generic-64.h
Normal file
1881
examples/intrinsics/generic-64.h
Normal file
File diff suppressed because it is too large
Load Diff
2160
examples/intrinsics/knc.h
Normal file
2160
examples/intrinsics/knc.h
Normal file
File diff suppressed because it is too large
Load Diff
3887
examples/intrinsics/sse4.h
Normal file
3887
examples/intrinsics/sse4.h
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,26 +1,7 @@
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
EXAMPLE=mandelbrot
|
||||
CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
|
||||
ISPC_SRC=mandelbrot.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
|
||||
default: mandelbrot
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ mandelbrot
|
||||
|
||||
mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o -lm
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
include ../common.mk
|
||||
|
||||
@@ -41,7 +41,6 @@
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "mandelbrot_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
@@ -68,38 +67,6 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
unsigned int width = 768;
|
||||
unsigned int height = 512;
|
||||
@@ -111,8 +78,6 @@ int main() {
|
||||
int maxIterations = 256;
|
||||
int *buf = new int[width*height];
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation; report the minimum
|
||||
// time of three runs.
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -40,8 +40,10 @@ static inline int mandel(float c_re, float c_im, int count) {
|
||||
|
||||
float new_re = z_re*z_re - z_im*z_im;
|
||||
float new_im = 2.f * z_re * z_im;
|
||||
z_re = c_re + new_re;
|
||||
z_im = c_im + new_im;
|
||||
unmasked {
|
||||
z_re = c_re + new_re;
|
||||
z_im = c_im + new_im;
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
@@ -51,7 +53,7 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
uniform float x1, uniform float y1,
|
||||
uniform int width, uniform int height,
|
||||
uniform int maxIterations,
|
||||
reference uniform int output[])
|
||||
uniform int output[])
|
||||
{
|
||||
float dx = (x1 - x0) / width;
|
||||
float dy = (y1 - y0) / height;
|
||||
@@ -60,16 +62,16 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
// Note that we'll be doing programCount computations in parallel,
|
||||
// so increment i by that much. This assumes that programCount evenly
|
||||
// divides width.
|
||||
for (uniform int i = 0; i < width; i += programCount) {
|
||||
foreach (i = 0 ... width) {
|
||||
// Figure out the position on the complex plane to compute the
|
||||
// number of iterations at. Note that the x values are
|
||||
// different across different program instances, since its
|
||||
// initializer incorporates the value of the programIndex
|
||||
// variable.
|
||||
float x = x0 + (programIndex + i) * dx;
|
||||
float x = x0 + i * dx;
|
||||
float y = y0 + j * dy;
|
||||
|
||||
int index = j * width + i + programIndex;
|
||||
int index = j * width + i;
|
||||
output[index] = mandel(x, y, maxIterations);
|
||||
}
|
||||
}
|
||||
|
||||
24
examples/mandelbrot/mandelbrot.vcxproj
Executable file → Normal file
24
examples/mandelbrot/mandelbrot.vcxproj
Executable file → Normal file
@@ -64,15 +64,19 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -81,6 +85,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -96,6 +101,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -113,6 +119,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -131,6 +138,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -147,18 +155,18 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="mandelbrot.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
@@ -1,41 +1,7 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
EXAMPLE=mandelbrot
|
||||
CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
|
||||
ISPC_SRC=mandelbrot.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
|
||||
TASK_CXX=../tasks_pthreads.cpp
|
||||
TASK_LIB=-lpthread
|
||||
|
||||
ifeq ($(ARCH), Darwin)
|
||||
TASK_CXX=../tasks_gcd.cpp
|
||||
TASK_LIB=
|
||||
endif
|
||||
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
|
||||
default: mandelbrot
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ mandelbrot
|
||||
|
||||
mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
include ../common.mk
|
||||
|
||||
@@ -40,8 +40,8 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include <string.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "mandelbrot_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
@@ -68,39 +68,12 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
static void usage() {
|
||||
fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
int main(int argc, char *argv[]) {
|
||||
unsigned int width = 1536;
|
||||
unsigned int height = 1024;
|
||||
float x0 = -2;
|
||||
@@ -108,7 +81,24 @@ int main() {
|
||||
float y0 = -1;
|
||||
float y1 = 1;
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
if (argc == 1)
|
||||
;
|
||||
else if (argc == 2) {
|
||||
if (strncmp(argv[1], "--scale=", 8) == 0) {
|
||||
float scale = atof(argv[1] + 8);
|
||||
if (scale == 0.f)
|
||||
usage();
|
||||
width *= scale;
|
||||
height *= scale;
|
||||
// round up to multiples of 16
|
||||
width = (width + 0xf) & ~0xf;
|
||||
height = (height + 0xf) & ~0xf;
|
||||
}
|
||||
else
|
||||
usage();
|
||||
}
|
||||
else
|
||||
usage();
|
||||
|
||||
int maxIterations = 512;
|
||||
int *buf = new int[width*height];
|
||||
@@ -119,6 +109,9 @@ int main() {
|
||||
//
|
||||
double minISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
reset_and_start_timer();
|
||||
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
@@ -128,9 +121,6 @@ int main() {
|
||||
printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
|
||||
writePPM(buf, width, height, "mandelbrot-ispc.ppm");
|
||||
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
@@ -138,6 +128,9 @@ int main() {
|
||||
//
|
||||
double minSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
reset_and_start_timer();
|
||||
mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -41,31 +41,33 @@ mandel(float c_re, float c_im, int count) {
|
||||
|
||||
float new_re = z_re*z_re - z_im*z_im;
|
||||
float new_im = 2.f * z_re * z_im;
|
||||
z_re = c_re + new_re;
|
||||
z_im = c_im + new_im;
|
||||
unmasked {
|
||||
z_re = c_re + new_re;
|
||||
z_im = c_im + new_im;
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
|
||||
/* Task to compute the Mandelbrot iterations for a span of scanlines from
|
||||
[ystart,yend).
|
||||
/* Task to compute the Mandelbrot iterations for a single scanline.
|
||||
*/
|
||||
task void
|
||||
mandelbrot_scanlines(uniform int ystart, uniform int yend,
|
||||
uniform float x0, uniform float dx,
|
||||
uniform float y0, uniform float dy,
|
||||
uniform int width, uniform int maxIterations,
|
||||
reference uniform int output[]) {
|
||||
for (uniform int j = ystart; j < yend; ++j) {
|
||||
for (uniform int i = 0; i < width; i += programCount) {
|
||||
float x = x0 + (programIndex + i) * dx;
|
||||
float y = y0 + j * dy;
|
||||
mandelbrot_scanline(uniform float x0, uniform float dx,
|
||||
uniform float y0, uniform float dy,
|
||||
uniform int width, uniform int height,
|
||||
uniform int span,
|
||||
uniform int maxIterations, uniform int output[]) {
|
||||
uniform int ystart = taskIndex * span;
|
||||
uniform int yend = min((taskIndex+1) * span, (unsigned int)height);
|
||||
|
||||
int index = j * width + i + programIndex;
|
||||
output[index] = mandel(x, y, maxIterations);
|
||||
}
|
||||
foreach (yi = ystart ... yend, xi = 0 ... width) {
|
||||
float x = x0 + xi * dx;
|
||||
float y = y0 + yi * dy;
|
||||
|
||||
int index = yi * width + xi;
|
||||
output[index] = mandel(x, y, maxIterations);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -74,13 +76,11 @@ export void
|
||||
mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
uniform float x1, uniform float y1,
|
||||
uniform int width, uniform int height,
|
||||
uniform int maxIterations, reference uniform int output[]) {
|
||||
uniform int maxIterations, uniform int output[]) {
|
||||
uniform float dx = (x1 - x0) / width;
|
||||
uniform float dy = (y1 - y0) / height;
|
||||
uniform int span = 4;
|
||||
|
||||
/* Launch task to compute results for spans of 'span' scanlines. */
|
||||
uniform int span = 2;
|
||||
for (uniform int j = 0; j < height; j += span)
|
||||
launch < mandelbrot_scanlines(j, j+span, x0, dx, y0, dy, width,
|
||||
maxIterations, output) >;
|
||||
launch[height/span] mandelbrot_scanline(x0, dx, y0, dy, width, height, span,
|
||||
maxIterations, output);
|
||||
}
|
||||
|
||||
26
examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
Executable file → Normal file
26
examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
Executable file → Normal file
@@ -64,15 +64,19 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -81,6 +85,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -96,6 +101,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -113,6 +119,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -131,6 +138,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -143,23 +151,23 @@
|
||||
<ItemGroup>
|
||||
<ClCompile Include="mandelbrot.cpp" />
|
||||
<ClCompile Include="mandelbrot_serial.cpp" />
|
||||
<ClCompile Include="../tasks_concrt.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="mandelbrot.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
@@ -1,26 +1,7 @@
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
|
||||
EXAMPLE=noise
|
||||
CPP_SRC=$(EXAMPLE).cpp $(EXAMPLE)_serial.cpp
|
||||
ISPC_SRC=noise.ispc
|
||||
ISPC_TARGETS=sse2,sse4,avx-x2
|
||||
|
||||
default: noise
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ noise
|
||||
|
||||
noise: dirs objs/noise.o objs/noise_serial.o objs/noise_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/noise.o objs/noise_ispc.o objs/noise_serial.o -lm
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/noise.o: objs/noise_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
include ../common.mk
|
||||
|
||||
@@ -41,7 +41,6 @@
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "noise_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
@@ -66,38 +65,6 @@ writePPM(float *buf, int width, int height, const char *fn) {
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
unsigned int width = 768;
|
||||
unsigned int height = 768;
|
||||
@@ -108,8 +75,6 @@ int main() {
|
||||
|
||||
float *buf = new float[width*height];
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation; report the minimum
|
||||
// time of three runs.
|
||||
|
||||
@@ -131,11 +131,11 @@ static float Noise(float x, float y, float z) {
|
||||
}
|
||||
|
||||
|
||||
static float Turbulence(float x, float y, float z, int octaves) {
|
||||
static float Turbulence(float x, float y, float z, uniform int octaves) {
|
||||
float omega = 0.6;
|
||||
|
||||
float sum = 0., lambda = 1., o = 1.;
|
||||
for (int i = 0; i < octaves; ++i) {
|
||||
for (uniform int i = 0; i < octaves; ++i) {
|
||||
sum += abs(o * Noise(lambda * x, lambda * y, lambda * z));
|
||||
lambda *= 1.99f;
|
||||
o *= omega;
|
||||
|
||||
28
examples/noise/noise.vcxproj
Executable file → Normal file
28
examples/noise/noise.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -64,15 +64,19 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -81,6 +85,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -96,6 +101,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -113,6 +119,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -131,6 +138,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -147,21 +155,21 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="noise.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
@@ -1,26 +1,7 @@
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -g -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
EXAMPLE=options
|
||||
CPP_SRC=options.cpp options_serial.cpp
|
||||
ISPC_SRC=options.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
|
||||
default: options
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ options
|
||||
|
||||
options: dirs objs/options.o objs/options_serial.o objs/options_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/options.o objs/options_ispc.o objs/options_serial.o -lm
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/options.o: objs/options_ispc.h options_defs.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc options_defs.h
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
include ../common.mk
|
||||
|
||||
@@ -31,6 +31,8 @@
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#define NOMINMAX
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
@@ -41,7 +43,6 @@ using std::max;
|
||||
|
||||
#include "options_defs.h"
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
|
||||
#include "options_ispc.h"
|
||||
using namespace ispc;
|
||||
@@ -54,49 +55,32 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
|
||||
float ra[], float va[],
|
||||
float result[], int count);
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
static void usage() {
|
||||
printf("usage: options [--count=<num options>]\n");
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
float *S = new float[N_OPTIONS];
|
||||
float *X = new float[N_OPTIONS];
|
||||
float *T = new float[N_OPTIONS];
|
||||
float *r = new float[N_OPTIONS];
|
||||
float *v = new float[N_OPTIONS];
|
||||
float *result = new float[N_OPTIONS];
|
||||
int main(int argc, char *argv[]) {
|
||||
int nOptions = 128*1024;
|
||||
|
||||
for (int i = 0; i < N_OPTIONS; ++i) {
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (strncmp(argv[i], "--count=", 8) == 0) {
|
||||
nOptions = atoi(argv[i] + 8);
|
||||
if (nOptions <= 0) {
|
||||
usage();
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float *S = new float[nOptions];
|
||||
float *X = new float[nOptions];
|
||||
float *T = new float[nOptions];
|
||||
float *r = new float[nOptions];
|
||||
float *v = new float[nOptions];
|
||||
float *result = new float[nOptions];
|
||||
|
||||
for (int i = 0; i < nOptions; ++i) {
|
||||
S[i] = 100; // stock price
|
||||
X[i] = 98; // option strike price
|
||||
T[i] = 2; // time (years)
|
||||
@@ -104,61 +88,109 @@ int main() {
|
||||
v[i] = 5; // volatility
|
||||
}
|
||||
|
||||
double sum;
|
||||
|
||||
//
|
||||
// Binomial options pricing model, ispc implementation
|
||||
//
|
||||
reset_and_start_timer();
|
||||
binomial_put_ispc(S, X, T, r, v, result, N_OPTIONS);
|
||||
double binomial_ispc = get_elapsed_mcycles();
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < N_OPTIONS; ++i)
|
||||
sum += result[i];
|
||||
printf("[binomial ispc]:\t\t[%.3f] million cycles (avg %f)\n",
|
||||
binomial_ispc, sum / N_OPTIONS);
|
||||
double binomial_ispc = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
binomial_put_ispc(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_mcycles();
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum += result[i];
|
||||
binomial_ispc = std::min(binomial_ispc, dt);
|
||||
}
|
||||
printf("[binomial ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n",
|
||||
binomial_ispc, sum / nOptions);
|
||||
|
||||
//
|
||||
// Binomial options pricing model, ispc implementation, tasks
|
||||
//
|
||||
double binomial_tasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
binomial_put_ispc_tasks(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_mcycles();
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum += result[i];
|
||||
binomial_tasks = std::min(binomial_tasks, dt);
|
||||
}
|
||||
printf("[binomial ispc, tasks]:\t\t[%.3f] million cycles (avg %f)\n",
|
||||
binomial_tasks, sum / nOptions);
|
||||
|
||||
//
|
||||
// Binomial options, serial implementation
|
||||
//
|
||||
reset_and_start_timer();
|
||||
binomial_put_serial(S, X, T, r, v, result, N_OPTIONS);
|
||||
double binomial_serial = get_elapsed_mcycles();
|
||||
sum = 0.f;
|
||||
for (int i = 0; i < N_OPTIONS; ++i)
|
||||
sum += result[i];
|
||||
printf("[binomial serial]:\t\t[%.3f] million cycles (avg %f)\n",
|
||||
binomial_serial, sum / N_OPTIONS);
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", binomial_serial / binomial_ispc);
|
||||
|
||||
//
|
||||
// Black-Scholes options pricing model, ispc implementation
|
||||
//
|
||||
sum = 0.f;
|
||||
reset_and_start_timer();
|
||||
for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) {
|
||||
black_scholes_ispc(S, X, T, r, v, result, N_OPTIONS);
|
||||
for (int i = 0; i < N_OPTIONS; ++i)
|
||||
double binomial_serial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
binomial_put_serial(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_mcycles();
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum += result[i];
|
||||
binomial_serial = std::min(binomial_serial, dt);
|
||||
}
|
||||
double bs_ispc = get_elapsed_mcycles();
|
||||
printf("[black-scholes ispc]:\t\t[%.3f] million cycles (avg %f)\n",
|
||||
bs_ispc, sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS));
|
||||
printf("[binomial serial]:\t\t[%.3f] million cycles (avg %f)\n",
|
||||
binomial_serial, sum / nOptions);
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||
binomial_serial / binomial_ispc, binomial_serial / binomial_tasks);
|
||||
|
||||
//
|
||||
// Black-Scholes options pricing model, ispc implementation, 1 thread
|
||||
//
|
||||
double bs_ispc = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
black_scholes_ispc(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_mcycles();
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum += result[i];
|
||||
bs_ispc = std::min(bs_ispc, dt);
|
||||
}
|
||||
printf("[black-scholes ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n",
|
||||
bs_ispc, sum / nOptions);
|
||||
|
||||
//
|
||||
// Black-Scholes options pricing model, ispc implementation, tasks
|
||||
//
|
||||
double bs_ispc_tasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
black_scholes_ispc_tasks(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_mcycles();
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum += result[i];
|
||||
bs_ispc_tasks = std::min(bs_ispc_tasks, dt);
|
||||
}
|
||||
printf("[black-scholes ispc, tasks]:\t[%.3f] million cycles (avg %f)\n",
|
||||
bs_ispc_tasks, sum / nOptions);
|
||||
|
||||
//
|
||||
// Black-Scholes options pricing model, serial implementation
|
||||
//
|
||||
sum = 0.f;
|
||||
reset_and_start_timer();
|
||||
for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) {
|
||||
black_scholes_serial(S, X, T, r, v, result, N_OPTIONS);
|
||||
for (int i = 0; i < N_OPTIONS; ++i)
|
||||
double bs_serial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
black_scholes_serial(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_mcycles();
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum += result[i];
|
||||
bs_serial = std::min(bs_serial, dt);
|
||||
}
|
||||
double bs_serial = get_elapsed_mcycles();
|
||||
printf("[black-scholes serial]:\t\t[%.3f] million cycles (avg %f)\n", bs_serial,
|
||||
sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS));
|
||||
sum / nOptions);
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", bs_serial / bs_ispc);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||
bs_serial / bs_ispc, bs_serial / bs_ispc_tasks);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -55,49 +55,100 @@ CND(float X) {
|
||||
return w;
|
||||
}
|
||||
|
||||
export void
|
||||
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
for (uniform int i = 0; i < count; i += programCount) {
|
||||
float S = Sa[i + programIndex], X = Xa[i + programIndex];
|
||||
float T = Ta[i + programIndex], r = ra[i + programIndex];
|
||||
float v = va[i + programIndex];
|
||||
task void
|
||||
bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
uniform int first = taskIndex * (count/taskCount);
|
||||
uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
|
||||
|
||||
foreach (i = first ... last) {
|
||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
||||
|
||||
float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
|
||||
float d2 = d1 - v * sqrt(T);
|
||||
|
||||
result[i + programIndex] = S * CND(d1) - X * exp(-r * T) * CND(d2);
|
||||
result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
|
||||
}
|
||||
}
|
||||
|
||||
export void
|
||||
black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
uniform int nTasks = max((int)64, (int)count/16384);
|
||||
launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
foreach (i = 0 ... count) {
|
||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
||||
|
||||
float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
|
||||
float d2 = d1 - v * sqrt(T);
|
||||
|
||||
result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static inline float
|
||||
binomial_put(float S, float X, float T, float r, float v) {
|
||||
float V[BINOMIAL_NUM];
|
||||
|
||||
float dt = T / BINOMIAL_NUM;
|
||||
float u = exp(v * sqrt(dt));
|
||||
float d = 1. / u;
|
||||
float disc = exp(r * dt);
|
||||
float Pu = (disc - d) / (u - d);
|
||||
|
||||
for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
|
||||
float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
|
||||
V[j] = max(0., X - S * upow);
|
||||
}
|
||||
|
||||
for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
|
||||
for (uniform int k = 0; k < j; ++k)
|
||||
V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
|
||||
return V[0];
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
float V[BINOMIAL_NUM];
|
||||
|
||||
for (uniform int i = 0; i < count; i += programCount) {
|
||||
float S = Sa[i + programIndex], X = Xa[i + programIndex];
|
||||
float T = Ta[i + programIndex], r = ra[i + programIndex];
|
||||
float v = va[i + programIndex];
|
||||
|
||||
float dt = T / BINOMIAL_NUM;
|
||||
float u = exp(v * sqrt(dt));
|
||||
float d = 1. / u;
|
||||
float disc = exp(r * dt);
|
||||
float Pu = (disc - d) / (u - d);
|
||||
|
||||
for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
|
||||
float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
|
||||
V[j] = max(0., X - S * upow);
|
||||
}
|
||||
|
||||
for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
|
||||
for (uniform int k = 0; k < j; ++k)
|
||||
V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
|
||||
|
||||
result[i + programIndex] = V[0];
|
||||
foreach (i = 0 ... count) {
|
||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
||||
result[i] = binomial_put(S, X, T, r, v);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
task void
|
||||
binomial_task(uniform float Sa[], uniform float Xa[],
|
||||
uniform float Ta[], uniform float ra[],
|
||||
uniform float va[], uniform float result[],
|
||||
uniform int count) {
|
||||
uniform int first = taskIndex * (count/taskCount);
|
||||
uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
|
||||
|
||||
foreach (i = first ... last) {
|
||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
||||
result[i] = binomial_put(S, X, T, r, v);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
|
||||
uniform float Ta[], uniform float ra[],
|
||||
uniform float va[], uniform float result[],
|
||||
uniform int count) {
|
||||
uniform int nTasks = max((int)64, (int)count/16384);
|
||||
launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user