Compare commits
152 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2f35bc1a0f | ||
|
|
1620e0508d | ||
|
|
cb7976bbf6 | ||
|
|
5ee4d7fce8 | ||
|
|
8f3e46f67e | ||
|
|
9ed07ff2b5 | ||
|
|
32a0a30cf5 | ||
|
|
6d39d5fc3e | ||
|
|
c999c8a237 | ||
|
|
aad269fdf4 | ||
|
|
d45c536c47 | ||
|
|
f1b8e5b1bf | ||
|
|
e7a70b05af | ||
|
|
cf73286938 | ||
|
|
e6f80c0adc | ||
|
|
5e31d7b6d0 | ||
|
|
649f2ad7b7 | ||
|
|
fade1cdf1d | ||
|
|
d261105a86 | ||
|
|
b3d3e8987b | ||
|
|
4e91f3777a | ||
|
|
5584240c7f | ||
|
|
7126a39092 | ||
|
|
8ad28a3f6f | ||
|
|
9921b8e530 | ||
|
|
9052d4b10b | ||
|
|
2405dae8e6 | ||
|
|
3607f3e045 | ||
|
|
de84acfa5d | ||
|
|
a501ab1aa6 | ||
|
|
cdc850f98c | ||
|
|
ca87579f23 | ||
|
|
38fc13d1ab | ||
|
|
cf9d9f717e | ||
|
|
173632f446 | ||
|
|
1dedd88132 | ||
|
|
0848c2cc19 | ||
|
|
e2a88d491f | ||
|
|
30f9dcd4f5 | ||
|
|
0c344b6755 | ||
|
|
6734021520 | ||
|
|
dd153d3c5c | ||
|
|
9ca7541d52 | ||
|
|
0c20483853 | ||
|
|
9d4ff1bc06 | ||
|
|
83f22f1939 | ||
|
|
6375ed9224 | ||
|
|
cf23cf9ef4 | ||
|
|
1147b53dcd | ||
|
|
4cf831a651 | ||
|
|
785d8a29d3 | ||
|
|
46d2bad231 | ||
|
|
32da8e11b4 | ||
|
|
5dedb6f836 | ||
|
|
2ea6d249d5 | ||
|
|
c86128e8ee | ||
|
|
375f1cb8e8 | ||
|
|
3ca7b6b078 | ||
|
|
effe901890 | ||
|
|
4f451bd041 | ||
|
|
c76ef7b174 | ||
|
|
743d82e935 | ||
|
|
18546e9c6d | ||
|
|
f24ab16b91 | ||
|
|
766b34683c | ||
|
|
b5bfa43e92 | ||
|
|
99221f7d17 | ||
|
|
eb7913f1dd | ||
|
|
08cad7a665 | ||
|
|
9cd92facbd | ||
|
|
85063f493c | ||
|
|
f65a20c700 | ||
|
|
e144724979 | ||
|
|
96a297c747 | ||
|
|
67e00b97c6 | ||
|
|
a94cabc692 | ||
|
|
ad9e66650d | ||
|
|
6de494cfdb | ||
|
|
58e34ba4ae | ||
|
|
33feeffe5d | ||
|
|
d0db46aac5 | ||
|
|
da76396c75 | ||
|
|
bbf3fb6307 | ||
|
|
4ab982bc16 | ||
|
|
34301e09f5 | ||
|
|
84e586e767 | ||
|
|
72a2f5d2f4 | ||
|
|
606cbab0d4 | ||
|
|
54ec56c81d | ||
|
|
a322398c62 | ||
|
|
f22b3a25bd | ||
|
|
b67498766e | ||
|
|
c340ff3893 | ||
|
|
b0f59777d4 | ||
|
|
e14208f489 | ||
|
|
7756265503 | ||
|
|
f841b775c3 | ||
|
|
8c921544a0 | ||
|
|
fe54f1ad8e | ||
|
|
74c2c8ae07 | ||
|
|
87ec7aa10d | ||
|
|
206c851146 | ||
|
|
60bdf1ef8a | ||
|
|
d7662b3eb9 | ||
|
|
ecaa57c7c6 | ||
|
|
fce183c244 | ||
|
|
7a92f8b3f9 | ||
|
|
96af08e789 | ||
|
|
cb29c10660 | ||
|
|
04c93043d6 | ||
|
|
46037c7a11 | ||
|
|
c570108026 | ||
|
|
230a0fadea | ||
|
|
87cf05e0d2 | ||
|
|
ff608eef71 | ||
|
|
f868a63064 | ||
|
|
c74116aa24 | ||
|
|
8c534d4d74 | ||
|
|
d821a11c7c | ||
|
|
8a138eeb5a | ||
|
|
137ea7bde6 | ||
|
|
e05b3981d9 | ||
|
|
a5a133ccce | ||
|
|
0ac4f7b620 | ||
|
|
467f1e71d7 | ||
|
|
a2996ed5d9 | ||
|
|
7d7dd2b204 | ||
|
|
172794ba5f | ||
|
|
9ee6f86c73 | ||
|
|
a4bb6b5520 | ||
|
|
a552927a6a | ||
|
|
2d52c732f1 | ||
|
|
25676d5643 | ||
|
|
158bd6ef9e | ||
|
|
7f662de6e3 | ||
|
|
80ca02af58 | ||
|
|
8aea4a836d | ||
|
|
922dbdec06 | ||
|
|
e230d2c9ca | ||
|
|
d0674b1706 | ||
|
|
16be1d313e | ||
|
|
0932dcd98b | ||
|
|
43a619669f | ||
|
|
59036cdf5b | ||
|
|
98a2d69e72 | ||
|
|
da0fd93315 | ||
|
|
165f90357f | ||
|
|
8ef3df57c5 | ||
|
|
96d40327d0 | ||
|
|
bba7211654 | ||
|
|
2d573acd17 | ||
|
|
654cfb4b4b |
27
LICENSE.txt
27
LICENSE.txt
@@ -114,3 +114,30 @@ CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
ispc's code to convert to and from half-precision floats is based on James
|
||||
Tursa's code, which is covered by the following license:
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the distribution
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
50
Makefile
50
Makefile
@@ -10,12 +10,18 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
|
||||
-lclangSerialization -lclangParse -lclangSema \
|
||||
-lclangAnalysis -lclangAST -lclangLex -lclangBasic
|
||||
|
||||
LLVM_LIBS=$(shell llvm-config --ldflags --libs) -lpthread -ldl
|
||||
ISPC_LIBS=$(CLANG_LIBS) \
|
||||
$(shell llvm-config --ldflags --libs) \
|
||||
-lpthread -ldl
|
||||
ISPC_TEST_LIBS=$(shell llvm-config --ldflags --libs) \
|
||||
-lpthread -ldl
|
||||
|
||||
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
|
||||
LLVM_VERSION_DEF=-DLLVM_$(shell llvm-config --version | sed s/\\./_/)
|
||||
LLVM_VERSION=$(shell llvm-config --version | sed s/\\./_/)
|
||||
LLVM_VERSION_DEF=-DLLVM_$(LLVM_VERSION)
|
||||
|
||||
BUILD_DATE=$(shell date +%Y%m%d)
|
||||
BUILD_VERSION=$(shell git log | head -1)
|
||||
BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
|
||||
|
||||
CXX=g++
|
||||
CPP=cpp
|
||||
@@ -43,17 +49,19 @@ CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
|
||||
util.cpp
|
||||
HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
|
||||
opt.h stmt.h sym.h type.h util.h
|
||||
STDLIB_SRC=stdlib-avx.ll stdlib-sse2.ll stdlib-sse4.ll stdlib-sse4x2.ll
|
||||
BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
|
||||
builtins-sse4.ll builtins-sse4x2.ll
|
||||
BISON_SRC=parse.yy
|
||||
FLEX_SRC=lex.ll
|
||||
|
||||
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(STDLIB_SRC:.ll=.o) stdlib-c.o stdlib_ispc.o \
|
||||
$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
|
||||
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
|
||||
builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
|
||||
$(FLEX_SRC:.ll=.o))
|
||||
|
||||
default: ispc ispc_test
|
||||
|
||||
.PHONY: dirs clean depend doxygen print_llvm_src
|
||||
.PRECIOUS: objs/stdlib-%.cpp
|
||||
.PRECIOUS: objs/builtins-%.cpp
|
||||
|
||||
depend: $(CXX_SRC) $(HEADERS)
|
||||
@echo Updating dependencies
|
||||
@@ -77,11 +85,11 @@ doxygen:
|
||||
|
||||
ispc: print_llvm_src dirs $(OBJS)
|
||||
@echo Creating ispc executable
|
||||
@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(CLANG_LIBS) $(LLVM_LIBS)
|
||||
@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
|
||||
|
||||
ispc_test: dirs ispc_test.cpp
|
||||
@echo Creating ispc_test executable
|
||||
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(LLVM_LIBS)
|
||||
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(ISPC_TEST_LIBS)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
@echo Compiling $<
|
||||
@@ -103,19 +111,27 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/stdlib-%.cpp: stdlib-%.ll stdlib.m4 stdlib-sse.ll
|
||||
@echo Creating C++ source from stdlib file $<
|
||||
@m4 stdlib.m4 $< | ./bitcode2cpp.py $< > $@
|
||||
objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll
|
||||
@echo Creating C++ source from builtin definitions file $<
|
||||
@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
|
||||
|
||||
objs/stdlib-%.o: objs/stdlib-%.cpp
|
||||
objs/builtins-%.o: objs/builtins-%.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/stdlib-c.cpp: stdlib-c.c
|
||||
@echo Creating C++ source from stdlib file $<
|
||||
@$(CLANG) -I /opt/l1om/usr/include/ -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py $< > $@
|
||||
objs/builtins-c-32.cpp: builtins-c.c
|
||||
@echo Creating C++ source from builtins definition file $<
|
||||
@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@
|
||||
|
||||
objs/stdlib-c.o: objs/stdlib-c.cpp
|
||||
objs/builtins-c-32.o: objs/builtins-c-32.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/builtins-c-64.cpp: builtins-c.c
|
||||
@echo Creating C++ source from builtins definition file $<
|
||||
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@
|
||||
|
||||
objs/builtins-c-64.o: objs/builtins-c-64.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
|
||||
@@ -15,8 +15,8 @@ code.
|
||||
|
||||
ispc is an open source compiler under the BSD license; see the file
|
||||
LICENSE.txt. ispc supports Windows, Mac, and Linux, with both x86 and
|
||||
x86-64 targets. It currently supports the SSE2 and SSE4 instruction sets,
|
||||
though support for AVX should be available soon.
|
||||
x86-64 targets. It currently supports the SSE2, SSE4, and AVX instruction
|
||||
sets.
|
||||
|
||||
For more information and examples, as well as a wiki and the bug database,
|
||||
see the ispc distribution site, http://ispc.github.com.
|
||||
|
||||
@@ -4,30 +4,36 @@ import sys
|
||||
import string
|
||||
import re
|
||||
import subprocess
|
||||
import platform
|
||||
import os
|
||||
|
||||
length=0
|
||||
|
||||
src=str(sys.argv[1])
|
||||
|
||||
target = re.sub(".*stdlib-", "", src)
|
||||
target = re.sub(".*builtins-", "", src)
|
||||
target = re.sub("\.ll$", "", target)
|
||||
target = re.sub("\.c$", "", target)
|
||||
target = re.sub("-", "_", target)
|
||||
|
||||
llvm_as="llvm-as"
|
||||
if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT") != -1:
|
||||
llvm_as = os.getenv("LLVM_INSTALL_DIR").replace("\\", "/") + "/bin/" + llvm_as
|
||||
|
||||
try:
|
||||
as_out=subprocess.Popen([ "llvm-as", "-", "-o", "-"], stdout=subprocess.PIPE)
|
||||
as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
|
||||
except IOError:
|
||||
print >> sys.stderr, "Couldn't open " + src
|
||||
sys.exit(1)
|
||||
|
||||
print "unsigned char stdlib_bitcode_" + target + "[] = {"
|
||||
print "unsigned char builtins_bitcode_" + target + "[] = {"
|
||||
for line in as_out.stdout.readlines():
|
||||
length = length + len(line)
|
||||
for c in line:
|
||||
print ord(c)
|
||||
print ", "
|
||||
print " 0 };\n\n"
|
||||
print "int stdlib_bitcode_" + target + "_length = " + str(length) + ";\n"
|
||||
print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n"
|
||||
|
||||
as_out.wait()
|
||||
|
||||
|
||||
278
builtins-avx-common.ll
Normal file
278
builtins-avx-common.ll
Normal file
@@ -0,0 +1,278 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; *** Untested *** AVX target implementation.
|
||||
;;
|
||||
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||
;; chance that there are bugs in the code in this file.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float iv = extract(__rcp_u(v), 0);
|
||||
; return iv * (2. - v * iv);
|
||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
||||
%scall = extractelement <4 x float> %call, i32 0
|
||||
|
||||
; do one N-R iteration
|
||||
%v_iv = fmul float %0, %scall
|
||||
%two_minus = fsub float 2., %v_iv
|
||||
%iv_mul = fmul float %scall, %two_minus
|
||||
ret float %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
; the roundss intrinsic is a total mess--docs say:
|
||||
;
|
||||
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
||||
;
|
||||
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
||||
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
||||
; return value is described by the following equations:
|
||||
;
|
||||
; r0 = RND(b0)
|
||||
; r1 = a1
|
||||
; r2 = a2
|
||||
; r3 = a3
|
||||
;
|
||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||
; here. So we pass the same register for both.
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundss, round mode down 0b01 | don't signal precision exceptions 0b1000 = 0b1001 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundss, round mode up 0b10 | don't signal precision exceptions 0b1000 = 0b1010 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundsd, round mode down 0b01 | don't signal precision exceptions 0b1000 = 0b1001 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundsd, round mode up 0b10 | don't signal precision exceptions 0b1000 = 0b1010 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||
%is = extractelement <4 x float> %vis, i32 0
|
||||
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul float %0, %is
|
||||
%v_is_is = fmul float %v_is, %is
|
||||
%three_sub = fsub float 3., %v_is_is
|
||||
%is_mul = fmul float %is, %three_sub
|
||||
%half_scale = fmul float 0.5, %is_mul
|
||||
ret float %half_scale
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fastmath
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define internal void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
; sqrtsd is an SSE2 instruction, so its LLVM intrinsic lives under the
; llvm.x86.sse2 prefix, the same prefix as the min.sd / max.sd intrinsics
; that the double-precision min/max code in this file uses. LLVM has no
; intrinsic under the plain sse prefix for a scalar double square root.
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
; Double-precision square root of a uniform (scalar) value.
; The m4 helper invoked below is given the result name, a vector width of 2,
; the element type, the intrinsic, and the scalar operand; it presumably
; wraps the operand in a 2-wide vector, calls the intrinsic, and extracts
; element 0 into %ret (NOTE(review): helper is defined elsewhere; confirm).
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
665
builtins-avx-x2.ll
Normal file
665
builtins-avx-x2.ll
Normal file
@@ -0,0 +1,665 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; *** Untested *** AVX target implementation.
|
||||
;;
|
||||
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||
;; chance that there are bugs in the code in this file.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Basic 16-wide definitions
|
||||
|
||||
stdlib_core(16)
|
||||
packed_load_and_store(16)
|
||||
scans(16)
|
||||
int64minmax(16)
|
||||
|
||||
include(`builtins-avx-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
|
||||
unary8to16(call, float, @llvm.x86.avx.rcp.ps.256, %0)
|
||||
; do one N-R iteration
|
||||
%v_iv = fmul <16 x float> %0, %call
|
||||
%two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.>, %v_iv
|
||||
%iv_mul = fmul <16 x float> %call, %two_minus
|
||||
ret <16 x float> %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
define internal <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
round8to16(%0, 8)
|
||||
}
|
||||
|
||||
define internal <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round8to16(%0, 9)
|
||||
}
|
||||
|
||||
define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round8to16(%0, 10)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
||||
|
||||
define internal <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
round4to16double(%0, 8)
|
||||
}
|
||||
|
||||
define internal <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
round4to16double(%0, 9)
|
||||
}
|
||||
|
||||
define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
round4to16double(%0, 10)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
; 16-wide reciprocal square root: takes the rsqrt.ps.256 estimate and refines
; it with one step of 0.5 * is * (3 - v * is * is), which is the
; Newton-Raphson update for 1/sqrt(v).
define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
; unary8to16 is a macro defined outside this chunk; the code below relies on
; it leaving the 16-wide estimate in %is (presumably built from two 8-wide
; intrinsic calls -- confirm against the macro).
unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul <16 x float> %v, %is
|
||||
%v_is_is = fmul <16 x float> %v_is, %is
|
||||
; 3 - v*is*is, with 3.0 splatted across all 16 lanes
%three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%is_mul = fmul <16 x float> %is, %three_sub
|
||||
; final scale by 0.5, again as a 16-lane splat
%half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
ret <16 x float> %half_scale
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
; 16-wide float square root built on the 8-wide sqrt.ps.256 intrinsic.
define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; unary8to16 (macro defined outside this chunk) is relied on to leave the
; 16-wide result in %call; presumably it applies the intrinsic to each
; 8-lane half of %0 -- confirm against the macro.
unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; svml
|
||||
|
||||
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
|
||||
; or, use the macro to call the 4-wide ones 4x with our 16-wide
|
||||
; vectors...
|
||||
|
||||
declare <16 x float> @__svml_sin(<16 x float>)
|
||||
declare <16 x float> @__svml_cos(<16 x float>)
|
||||
declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
|
||||
declare <16 x float> @__svml_tan(<16 x float>)
|
||||
declare <16 x float> @__svml_atan(<16 x float>)
|
||||
declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
|
||||
declare <16 x float> @__svml_exp(<16 x float>)
|
||||
declare <16 x float> @__svml_log(<16 x float>)
|
||||
declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
; 16-wide float max built on the 8-wide max.ps.256 intrinsic.
define internal <16 x float> @__max_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
; binary8to16 (macro defined outside this chunk) is relied on to leave the
; 16-wide result in %call; presumably it applies the intrinsic to matching
; 8-lane halves of %0 and %1 -- confirm against the macro.
binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
|
||||
; 16-wide float min built on the 8-wide min.ps.256 intrinsic.
define internal <16 x float> @__min_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
; binary8to16 (macro defined outside this chunk) is relied on to leave the
; 16-wide result in %call; presumably it applies the intrinsic to matching
; 8-lane halves of %0 and %1 -- confirm against the macro.
binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
; 16-wide signed i32 min built on the 4-wide SSE4.1 pminsd intrinsic.
define internal <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
; binary4to16 (macro defined outside this chunk) is relied on to leave the
; 16-wide result in %ret; presumably it applies the intrinsic to matching
; 4-lane quarters of %0 and %1 -- confirm against the macro.
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
; 16-wide signed i32 max built on the 4-wide SSE4.1 pmaxsd intrinsic.
define internal <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
; binary4to16 (macro defined outside this chunk) is relied on to leave the
; 16-wide result in %ret; presumably it applies the intrinsic to matching
; 4-lane quarters of %0 and %1 -- confirm against the macro.
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
; 16-wide unsigned i32 min built on the 4-wide SSE4.1 pminud intrinsic.
define internal <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
; binary4to16 (macro defined outside this chunk) is relied on to leave the
; 16-wide result in %ret; presumably it applies the intrinsic to matching
; 4-lane quarters of %0 and %1 -- confirm against the macro.
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
; 16-wide unsigned i32 max built on the 4-wide SSE4.1 pmaxud intrinsic.
define internal <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
; binary4to16 (macro defined outside this chunk) is relied on to leave the
; 16-wide result in %ret; presumably it applies the intrinsic to matching
; 4-lane quarters of %0 and %1 -- confirm against the macro.
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
; Packs a 16-lane mask into one i32. The 8-wide movmsk.ps.256 intrinsic is run
; on each half; the result for lanes 0-7 lands in bits 0-7 and the result for
; lanes 8-15 is shifted up into bits 8-15.
define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
  ; the intrinsic takes floats, so reinterpret the mask bits first
  %as_float = bitcast <16 x i32> %0 to <16 x float>

  ; split into the two 8-lane halves
  %lanes_lo = shufflevector <16 x float> %as_float, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %lanes_hi = shufflevector <16 x float> %as_float, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  %bits_lo = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %lanes_lo) nounwind readnone
  %bits_hi = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %lanes_hi) nounwind readnone

  ; make room for the low half's 8 bits, then merge
  %bits_hi_shifted = shl i32 %bits_hi, 8
  %packed = or i32 %bits_hi_shifted, %bits_lo
  ret i32 %packed
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal float ops
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
; Horizontal sum of all 16 float lanes. The vector is split into two 8-lane
; halves and folded with three rounds of hadd.ps.256. The code then reads
; elements 0 and 4 of the last result as the two remaining partial sums
; (hadd works within each 128-bit half) and adds them as scalars.
define internal float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
  %half_lo = shufflevector <16 x float> %0, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %half_hi = shufflevector <16 x float> %0, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  ; round 1 folds both halves together; rounds 2 and 3 fold the result onto itself
  %fold1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %half_lo, <8 x float> %half_hi)
  %fold2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %fold1, <8 x float> %fold1)
  %fold3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %fold2, <8 x float> %fold2)

  ; one partial sum per 128-bit half remains
  %part_lo = extractelement <8 x float> %fold3, i32 0
  %part_hi = extractelement <8 x float> %fold3, i32 4
  %total = fadd float %part_lo, %part_hi
  ret float %total
}
|
||||
|
||||
|
||||
; Horizontal float min over 16 lanes. The body is the reduce16 macro (defined
; outside this chunk), given the element type, the 16-wide min and the scalar
; min. No ret appears here, so the macro presumably emits it -- confirm.
define internal float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
|
||||
reduce16(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
|
||||
; Horizontal float max over 16 lanes. The body is the reduce16 macro (defined
; outside this chunk), given the element type, the 16-wide max and the scalar
; max. No ret appears here, so the macro presumably emits it -- confirm.
define internal float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
|
||||
reduce16(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
reduce_equal(16)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int32 ops
|
||||
|
||||
; Lane-wise i32 addition of two 16-wide vectors; passed to reduce16 as the
; vector combining step of __reduce_add_int32.
define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) nounwind readnone alwaysinline {
  %lane_sums = add <16 x i32> %0, %1
  ret <16 x i32> %lane_sums
}
|
||||
|
||||
; Scalar i32 addition; passed to reduce16 as the scalar combining step of
; __reduce_add_int32.
define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
  %total = add i32 %0, %1
  ret i32 %total
}
|
||||
|
||||
; Horizontal i32 sum over 16 lanes. The body is the reduce16 macro (defined
; outside this chunk), given the element type, the 16-wide add and the scalar
; add. No ret appears here, so the macro presumably emits it -- confirm.
define internal i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
; Horizontal signed i32 min over 16 lanes. The body is the reduce16 macro
; (defined outside this chunk), given the element type, the 16-wide min and
; the scalar min. No ret appears here, so the macro presumably emits it.
define internal i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
; Horizontal signed i32 max over 16 lanes. The body is the reduce16 macro
; (defined outside this chunk), given the element type, the 16-wide max and
; the scalar max. No ret appears here, so the macro presumably emits it.
define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint32 ops
|
||||
|
||||
; Horizontal sum for unsigned i32 lanes. Integer add does not depend on
; signedness, so this forwards to the signed reduction.
define internal i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
  %total = call i32 @__reduce_add_int32(<16 x i32> %v)
  ret i32 %total
}
|
||||
|
||||
; Horizontal unsigned i32 min over 16 lanes. The body is the reduce16 macro
; (defined outside this chunk), given the element type, the 16-wide unsigned
; min and the scalar one. No ret appears here; the macro presumably emits it.
define internal i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
; Horizontal unsigned i32 max over 16 lanes. The body is the reduce16 macro
; (defined outside this chunk), given the element type, the 16-wide unsigned
; max and the scalar one. No ret appears here; the macro presumably emits it.
define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal double ops
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
; Horizontal sum of all 16 double lanes. The input is cut into four 4-lane
; quarters, the quarters are added pairwise, and two rounds of hadd.pd.256
; fold what is left. The code then reads elements 0 and 2 of the last result
; as the two remaining partial sums and adds them as scalars.
define internal double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
  ; quarters 0 and 1, summed lane-wise
  %q0 = shufflevector <16 x double> %0, <16 x double> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %q1 = shufflevector <16 x double> %0, <16 x double> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %q01 = fadd <4 x double> %q0, %q1

  ; quarters 2 and 3, summed lane-wise
  %q2 = shufflevector <16 x double> %0, <16 x double> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %q3 = shufflevector <16 x double> %0, <16 x double> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %q23 = fadd <4 x double> %q2, %q3

  ; fold the two 4-wide partials together, then fold the result onto itself
  %fold1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %q01, <4 x double> %q23)
  %fold2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %fold1, <4 x double> %fold1)

  ; one partial sum per 128-bit half remains
  %part_lo = extractelement <4 x double> %fold2, i32 0
  %part_hi = extractelement <4 x double> %fold2, i32 2
  %total = fadd double %part_lo, %part_hi
  ret double %total
}
|
||||
|
||||
; Horizontal double min over 16 lanes. The body is the reduce16 macro (defined
; outside this chunk), given the element type, the 16-wide min and the scalar
; min. No ret appears here, so the macro presumably emits it -- confirm.
define internal double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
|
||||
reduce16(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
|
||||
; Horizontal double max over 16 lanes. The body is the reduce16 macro (defined
; outside this chunk), given the element type, the 16-wide max and the scalar
; max. No ret appears here, so the macro presumably emits it -- confirm.
define internal double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
|
||||
reduce16(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int64 ops
|
||||
|
||||
; Lane-wise i64 addition of two 16-wide vectors; passed to reduce16 as the
; vector combining step of __reduce_add_int64.
define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) nounwind readnone alwaysinline {
  %lane_sums = add <16 x i64> %0, %1
  ret <16 x i64> %lane_sums
}
|
||||
|
||||
; Scalar i64 addition; passed to reduce16 as the scalar combining step of
; __reduce_add_int64.
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %total = add i64 %0, %1
  ret i64 %total
}
|
||||
|
||||
; Horizontal i64 sum over 16 lanes. The body is the reduce16 macro (defined
; outside this chunk), given the element type, the 16-wide add and the scalar
; add. No ret appears here, so the macro presumably emits it -- confirm.
define internal i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
; Horizontal signed i64 min over 16 lanes. The body is the reduce16 macro
; (defined outside this chunk), given the element type and the varying and
; uniform min helpers, which are also defined outside this chunk.
define internal i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
; Horizontal signed i64 max over 16 lanes. The body is the reduce16 macro
; (defined outside this chunk), given the element type and the varying and
; uniform max helpers, which are also defined outside this chunk.
define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint64 ops
|
||||
|
||||
; Horizontal sum for unsigned i64 lanes. Integer add does not depend on
; signedness, so this forwards to the signed reduction.
define internal i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
  %total = call i64 @__reduce_add_int64(<16 x i64> %v)
  ret i64 %total
}
|
||||
|
||||
; Horizontal unsigned i64 min over 16 lanes. The body is the reduce16 macro
; (defined outside this chunk), given the element type and the varying and
; uniform unsigned min helpers, which are also defined outside this chunk.
define internal i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
|
||||
; Horizontal unsigned i64 max over 16 lanes. The body is the reduce16 macro
; (defined outside this chunk), given the element type and the varying and
; uniform unsigned max helpers, which are also defined outside this chunk.
define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(16, i8, 8)
|
||||
load_and_broadcast(16, i16, 16)
|
||||
load_and_broadcast(16, i32, 32)
|
||||
load_and_broadcast(16, i64, 64)
|
||||
|
||||
; no masked load instruction for i8 and i16 types??
|
||||
load_masked(16, i8, 8, 1)
|
||||
load_masked(16, i16, 16, 2)
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
|
||||
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
|
||||
|
||||
; Masked load of 16 x i32 from the byte pointer %0. The work is done as two
; 8-wide maskload.ps.256 calls (lanes 0-7 at %0, lanes 8-15 at %0 + 32 bytes)
; whose results are joined and reinterpreted as integers.
define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
  ; the intrinsic takes its mask as floats
  %mask_f = bitcast <16 x i32> %mask to <16 x float>
  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  ; second half starts 8 lanes x 4 bytes = 32 bytes in
  %addr_hi = getelementptr i8 * %0, i32 32

  %data_lo = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %mask_lo)
  %data_hi = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %addr_hi, <8 x float> %mask_hi)

  ; concatenate the halves back into one 16-wide vector
  %data_f = shufflevector <8 x float> %data_lo, <8 x float> %data_hi,
          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %data_i = bitcast <16 x float> %data_f to <16 x i32>
  ret <16 x i32> %data_i
}
|
||||
|
||||
|
||||
; Masked load of 16 x i64 from the byte pointer %0, done as four 4-wide
; maskload.pd.256 calls at byte offsets 0, 32, 64 and 96. The mask arrives
; with one i32 per lane, so each mask element is duplicated to fill the
; 64-bit slot of the lane it controls.
define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
  ; quarter 0: lanes 0-3
  %m0_i32 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %m0 = bitcast <8 x i32> %m0_i32 to <4 x double>
  ; quarter 1: lanes 4-7
  %m1_i32 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  %m1 = bitcast <8 x i32> %m1_i32 to <4 x double>
  ; quarter 2: lanes 8-11
  %m2_i32 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
  %m2 = bitcast <8 x i32> %m2_i32 to <4 x double>
  ; quarter 3: lanes 12-15
  %m3_i32 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %m3 = bitcast <8 x i32> %m3_i32 to <4 x double>

  ; each quarter covers 4 lanes x 8 bytes = 32 bytes
  %addr1 = getelementptr i8 * %0, i32 32
  %addr2 = getelementptr i8 * %0, i32 64
  %addr3 = getelementptr i8 * %0, i32 96

  %d0 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %m0)
  %d1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr1, <4 x double> %m1)
  %d2 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr2, <4 x double> %m2)
  %d3 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr3, <4 x double> %m3)

  ; stitch 4+4 -> 8, 4+4 -> 8, then 8+8 -> 16
  %d01 = shufflevector <4 x double> %d0, <4 x double> %d1,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %d23 = shufflevector <4 x double> %d2, <4 x double> %d3,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %d0123 = shufflevector <8 x double> %d01, <8 x double> %d23,
          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %loaded = bitcast <16 x double> %d0123 to <16 x i64>
  ret <16 x i64> %loaded
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
; FIXME: there is no AVX instruction for these, but we could be clever
|
||||
; by packing the bits down and setting the last 3/4 or half, respectively,
|
||||
; of the mask to zero... Not sure if this would be a win in the end
|
||||
gen_masked_store(16, i8, 8)
|
||||
gen_masked_store(16, i16, 16)
|
||||
|
||||
; note that mask is the 2nd parameter, not the 3rd one!!
|
||||
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
|
||||
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
|
||||
|
||||
; Masked store of 16 x i32: %0 is the destination, %1 the values, %2 the
; per-lane mask. Done as two 8-wide maskstore.ps.256 calls, lanes 0-7 at the
; base address and lanes 8-15 at base + 32 bytes. The intrinsic takes the
; mask as its second operand and the data as its third.
define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
                               <16 x i32>) nounwind alwaysinline {
  %dst = bitcast <16 x i32> * %0 to i8 *
  %data_f = bitcast <16 x i32> %1 to <16 x float>
  %mask_f = bitcast <16 x i32> %2 to <16 x float>

  ; lanes 0-7
  %data_lo = shufflevector <16 x float> %data_f, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

  ; lanes 8-15
  %data_hi = shufflevector <16 x float> %data_f, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  ; 8 lanes x 4 bytes = 32 bytes to the second half
  %dst_hi = getelementptr i8 * %dst, i32 32

  call void @llvm.x86.avx.maskstore.ps.256(i8 * %dst, <8 x float> %mask_lo, <8 x float> %data_lo)
  call void @llvm.x86.avx.maskstore.ps.256(i8 * %dst_hi, <8 x float> %mask_hi, <8 x float> %data_hi)

  ret void
}
|
||||
|
||||
; Masked store of 16 x i64: %0 is the destination, %1 the values, %mask the
; per-lane i32 mask. Done as four 4-wide maskstore.pd.256 calls at byte
; offsets 0, 32, 64 and 96. Each i32 mask element is duplicated so it fills
; the 64-bit slot of the lane it controls.
define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
                               <16 x i32> %mask) nounwind alwaysinline {
  %dst = bitcast <16 x i64> * %0 to i8 *
  %data = bitcast <16 x i64> %1 to <16 x double>

  ; quarter 0: lanes 0-3
  %m0_i32 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %m0 = bitcast <8 x i32> %m0_i32 to <4 x double>
  %d0 = shufflevector <16 x double> %data, <16 x double> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>

  ; quarter 1: lanes 4-7
  %m1_i32 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  %m1 = bitcast <8 x i32> %m1_i32 to <4 x double>
  %d1 = shufflevector <16 x double> %data, <16 x double> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>

  ; quarter 2: lanes 8-11
  %m2_i32 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
  %m2 = bitcast <8 x i32> %m2_i32 to <4 x double>
  %d2 = shufflevector <16 x double> %data, <16 x double> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>

  ; quarter 3: lanes 12-15
  %m3_i32 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %m3 = bitcast <8 x i32> %m3_i32 to <4 x double>
  %d3 = shufflevector <16 x double> %data, <16 x double> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>

  ; each quarter covers 4 lanes x 8 bytes = 32 bytes
  %dst1 = getelementptr i8 * %dst, i32 32
  %dst2 = getelementptr i8 * %dst, i32 64
  %dst3 = getelementptr i8 * %dst, i32 96

  ; mask is the second operand, data the third
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst, <4 x double> %m0, <4 x double> %d0)
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst1, <4 x double> %m1, <4 x double> %d1)
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst2, <4 x double> %m2, <4 x double> %d2)
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst3, <4 x double> %m3, <4 x double> %d3)

  ret void
}
|
||||
|
||||
|
||||
masked_store_blend_8_16_by_16()
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
|
||||
<8 x float>) nounwind readnone
|
||||
|
||||
; Masked store of 16 x i32 done as read-blend-write: load the current
; contents of %0, blend them with the new values in %1 under the mask in %2
; (two 8-wide blendv.ps.256 calls), and store the whole 16-wide vector back.
; Both memory accesses use 4-byte alignment.
define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
                                     <16 x i32>) nounwind alwaysinline {
  %mask_f = bitcast <16 x i32> %2 to <16 x float>
  %cur_i = load <16 x i32>* %0, align 4
  %cur_f = bitcast <16 x i32> %cur_i to <16 x float>
  %new_f = bitcast <16 x i32> %1 to <16 x float>

  ; lanes 0-7
  %cur_lo = shufflevector <16 x float> %cur_f, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %new_lo = shufflevector <16 x float> %new_f, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mix_lo = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %cur_lo,
                                                         <8 x float> %new_lo,
                                                         <8 x float> %mask_lo)

  ; lanes 8-15
  %cur_hi = shufflevector <16 x float> %cur_f, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %new_hi = shufflevector <16 x float> %new_f, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %mix_hi = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %cur_hi,
                                                         <8 x float> %new_hi,
                                                         <8 x float> %mask_hi)

  ; rejoin the halves and write everything back
  %mix_f = shufflevector <8 x float> %mix_lo, <8 x float> %mix_hi,
          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %mix_i = bitcast <16 x float> %mix_f to <16 x i32>
  store <16 x i32> %mix_i, <16 x i32>* %0, align 4
  ret void
}
|
||||
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
|
||||
<4 x double>) nounwind readnone
|
||||
|
||||
; Masked store of 16 x i64 done as read-blend-write: load the current
; contents of %ptr, blend them with %newi64 under %mask (four 4-wide
; blendv.pd.256 calls), and store the whole 16-wide vector back.
;
; %mask carries one i32 per lane; each element is duplicated so it fills the
; 64-bit slot of the lane it controls.
;
; The load and the store both state "align 8". A store with no alignment
; would claim the ABI alignment of <16 x i64>, which is stricter than what
; the load in this same function assumes about %ptr.
define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
                                     <16 x i32> %mask) nounwind alwaysinline {
  ; current memory contents, split into four 4-lane quarters
  %oldValue = load <16 x i64>* %ptr, align 8
  %old = bitcast <16 x i64> %oldValue to <16 x double>
  %old0d = shufflevector <16 x double> %old, <16 x double> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %old1d = shufflevector <16 x double> %old, <16 x double> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %old2d = shufflevector <16 x double> %old, <16 x double> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %old3d = shufflevector <16 x double> %old, <16 x double> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>

  ; new values, split the same way
  %new = bitcast <16 x i64> %newi64 to <16 x double>
  %new0d = shufflevector <16 x double> %new, <16 x double> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %new1d = shufflevector <16 x double> %new, <16 x double> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %new2d = shufflevector <16 x double> %new, <16 x double> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %new3d = shufflevector <16 x double> %new, <16 x double> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>

  ; double up each i32 mask element, then view each pair as one double
  %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  %mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
  %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>

  ; per-quarter blend of old and new under the mask
  %result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
                                 <4 x double> %new0d, <4 x double> %mask0d)
  %result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
                                 <4 x double> %new1d, <4 x double> %mask1d)
  %result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
                                 <4 x double> %new2d, <4 x double> %mask2d)
  %result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
                                 <4 x double> %new3d, <4 x double> %mask3d)

  ; stitch 4+4 -> 8, 4+4 -> 8, then 8+8 -> 16
  %result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

  %result = shufflevector <8 x double> %result01, <8 x double> %result23,
          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %result64 = bitcast <16 x double> %result to <16 x i64>
  ; same alignment as the load above
  store <16 x i64> %result64, <16 x i64> * %ptr, align 8
  ret void
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(16, i8)
|
||||
gen_gather(16, i16)
|
||||
gen_gather(16, i32)
|
||||
gen_gather(16, i64)
|
||||
|
||||
gen_scatter(16, i8)
|
||||
gen_scatter(16, i16)
|
||||
gen_scatter(16, i32)
|
||||
gen_scatter(16, i64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
||||
|
||||
; 16-wide double square root built on the 4-wide sqrt.pd.256 intrinsic.
define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
|
||||
; unary4to16 (macro defined outside this chunk) is relied on to leave the
; 16-wide result in %ret; presumably it applies the intrinsic to each
; 4-lane quarter of %0 -- confirm against the macro.
unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
||||
ret <16 x double> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
; 16-wide double min built on the 4-wide min.pd.256 intrinsic.
define internal <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||
; binary4to16 (macro defined outside this chunk) is relied on to leave the
; 16-wide result in %ret; presumably it applies the intrinsic to matching
; 4-lane quarters of %0 and %1 -- confirm against the macro.
binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
||||
ret <16 x double> %ret
|
||||
}
|
||||
|
||||
; 16-wide double max built on the 4-wide max.pd.256 intrinsic.
define internal <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||
; binary4to16 (macro defined outside this chunk) is relied on to leave the
; 16-wide result in %ret; presumably it applies the intrinsic to matching
; 4-lane quarters of %0 and %1 -- confirm against the macro.
binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
||||
ret <16 x double> %ret
|
||||
}
|
||||
@@ -41,14 +41,15 @@
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
int8_16(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
|
||||
include(`builtins-avx-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
@@ -63,25 +64,10 @@ define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly
|
||||
ret <8 x float> %iv_mul
|
||||
}
|
||||
|
||||
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float iv = extract(__rcp_u(v), 0);
|
||||
; return iv * (2. - v * iv);
|
||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
||||
%scall = extractelement <4 x float> %call, i32 0
|
||||
|
||||
; do one N-R iteration
|
||||
%v_iv = fmul float %0, %scall
|
||||
%two_minus = fsub float 2., %v_iv
|
||||
%iv_mul = fmul float %scall, %two_minus
|
||||
ret float %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
@@ -89,111 +75,43 @@ define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonl
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
; the roundss intrinsic is a total mess--docs say:
|
||||
;
|
||||
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
||||
;
|
||||
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
||||
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
||||
; return value is described by the following equations:
|
||||
;
|
||||
; r0 = RND(b0)
|
||||
; r1 = a1
|
||||
; r2 = a2
|
||||
; r3 = a3
|
||||
;
|
||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||
; here. So we pass the same register for both.
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundss, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundss, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
round4to8double(%0, 8)
|
||||
}
|
||||
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
round4to8double(%0, 9)
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundsd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
round4to8double(%0, 10)
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundsd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
@@ -201,64 +119,24 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul <8 x float> %v, %is
|
||||
%v_is_is = fmul <8 x float> %v_is, %is
|
||||
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3., float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%is_mul = fmul <8 x float> %is, %three_sub
|
||||
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
ret <8 x float> %half_scale
|
||||
}
|
||||
|
||||
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||
%is = extractelement <4 x float> %vis, i32 0
|
||||
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul float %0, %is
|
||||
%v_is_is = fmul float %v_is, %is
|
||||
%three_sub = fsub float 3., %v_is_is
|
||||
%is_mul = fmul float %is, %three_sub
|
||||
%half_scale = fmul float 0.5, %is_mul
|
||||
ret float %half_scale
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fastmath
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i32 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i32 *) nounwind
|
||||
|
||||
define internal void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
call void @llvm.x86.sse.stmxcsr(i32 * %ptr)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.sse.ldmxcsr(i32 * %ptr)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; svml
|
||||
|
||||
@@ -280,9 +158,7 @@ declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
|
||||
;; float min/max
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__max_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
@@ -290,97 +166,43 @@ define internal <8 x float> @__max_varying_float(<8 x float>,
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__min_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx.min.sd.256(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
declare <8 x i32> @llvm.x86.avx.max.sd.256(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
|
||||
define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x i32> @llvm.x86.avx.min.sd.256(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.min.sd.256, %0, %1)
|
||||
ret i32 %ret
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x i32> @llvm.x86.avx.max.sd.256(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.max.sd.256, %0, %1)
|
||||
ret i32 %ret
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
; FIXME: looks like these aren't available in LLVM?
|
||||
declare <8 x i32> @llvm.x86.avx.min.ud.256(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
declare <8 x i32> @llvm.x86.avx.max.ud.256(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
|
||||
define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x i32> @llvm.x86.avx.min.ud.256(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %call
|
||||
define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.min.ud.256, %0, %1)
|
||||
ret i32 %ret
|
||||
define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x i32> @llvm.x86.avx.max.ud.256(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.max.ud.256, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
@@ -413,6 +235,7 @@ define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysi
|
||||
reduce8(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
reduce_equal(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int32 ops
|
||||
@@ -473,9 +296,10 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
|
||||
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
|
||||
%scalar1 = extractelement <4 x double> %sum0, i32 0
|
||||
%scalar2 = extractelement <4 x double> %sum1, i32 1
|
||||
%sum = fadd double %scalar1, %scalar2
|
||||
%final0 = extractelement <4 x double> %sum1, i32 0
|
||||
%final1 = extractelement <4 x double> %sum1, i32 2
|
||||
%sum = fadd double %final0, %final1
|
||||
|
||||
ret double %sum
|
||||
}
|
||||
|
||||
@@ -539,55 +363,14 @@ define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinli
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
%mm = call i32 @__movmsk(<8 x i32> %mask)
|
||||
%any_on = icmp ne i32 %mm, 0
|
||||
br i1 %any_on, label %load, label %skip
|
||||
|
||||
load:
|
||||
; TODO: make sure this becomes a vbroadcast...
|
||||
%ptr = bitcast i8 * %0 to i32 *
|
||||
%val = load i32 * %ptr
|
||||
|
||||
%ret0 = insertelement <8 x i32> undef, i32 %val, i32 0
|
||||
%ret1 = insertelement <8 x i32> %ret0, i32 %val, i32 1
|
||||
%ret2 = insertelement <8 x i32> %ret1, i32 %val, i32 2
|
||||
%ret3 = insertelement <8 x i32> %ret2, i32 %val, i32 3
|
||||
%ret4 = insertelement <8 x i32> %ret3, i32 %val, i32 4
|
||||
%ret5 = insertelement <8 x i32> %ret4, i32 %val, i32 5
|
||||
%ret6 = insertelement <8 x i32> %ret5, i32 %val, i32 6
|
||||
%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
|
||||
ret <8 x i32> %ret7
|
||||
|
||||
skip:
|
||||
ret <8 x i32> undef
|
||||
}
|
||||
|
||||
|
||||
; Loads a single i64 from the address passed as the first argument and
; returns it replicated into all 8 lanes of the result vector.
;
; The memory access is skipped when @__movmsk returns 0 for %mask; in that
; case the result is undef.
;
; Parameters:
;   %0    - i8* address of the 64-bit value to load
;   %mask - <8 x i32> execution mask, collapsed to a scalar via @__movmsk
define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
  ; only touch memory if the collapsed mask is non-zero
  %mm = call i32 @__movmsk(<8 x i32> %mask)
  %any_on = icmp ne i32 %mm, 0
  br i1 %any_on, label %load, label %skip

load:
  ; TODO: make sure this becomes a vbroadcast...
  %ptr = bitcast i8 * %0 to i64 *
  %val = load i64 * %ptr

  ; splat the loaded scalar into lanes 0..7
  %ret0 = insertelement <8 x i64> undef, i64 %val, i32 0
  %ret1 = insertelement <8 x i64> %ret0, i64 %val, i32 1
  %ret2 = insertelement <8 x i64> %ret1, i64 %val, i32 2
  %ret3 = insertelement <8 x i64> %ret2, i64 %val, i32 3
  %ret4 = insertelement <8 x i64> %ret3, i64 %val, i32 4
  %ret5 = insertelement <8 x i64> %ret4, i64 %val, i32 5
  %ret6 = insertelement <8 x i64> %ret5, i64 %val, i32 6
  %ret7 = insertelement <8 x i64> %ret6, i64 %val, i32 7
  ; return the fully populated vector; returning %ret3 here would leave
  ; lanes 4-7 undef
  ret <8 x i64> %ret7

skip:
  ret <8 x i64> undef
}
|
||||
load_and_broadcast(8, i8, 8)
|
||||
load_and_broadcast(8, i16, 16)
|
||||
load_and_broadcast(8, i32, 32)
|
||||
load_and_broadcast(8, i64, 64)
|
||||
|
||||
; no masked load instruction for i8 and i16 types??
|
||||
load_masked(8, i8, 8, 1)
|
||||
load_masked(8, i16, 16, 2)
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
|
||||
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
|
||||
@@ -623,6 +406,12 @@ define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
; FIXME: there is no AVX instruction for these, but we could be clever
|
||||
; by packing the bits down and setting the last 3/4 or half, respectively,
|
||||
; of the mask to zero... Not sure if this would be a win in the end
|
||||
gen_masked_store(8, i8, 8)
|
||||
gen_masked_store(8, i16, 16)
|
||||
|
||||
; note that mask is the 2nd parameter, not the 3rd one!!
|
||||
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
|
||||
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
|
||||
@@ -661,12 +450,14 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
||||
}
|
||||
|
||||
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
|
||||
<8 x float>) nounwind readnone
|
||||
|
||||
|
||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
|
||||
%oldValue = load <8 x i32>* %0, align 4
|
||||
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
|
||||
@@ -680,7 +471,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
}
|
||||
|
||||
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %i32mask) nounwind alwaysinline {
|
||||
%oldValue = load <8 x i64>* %ptr, align 8
|
||||
%mask = bitcast <8 x i32> %i32mask to <8 x float>
|
||||
@@ -730,56 +521,44 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
|
||||
gen_scatter(8, i8)
|
||||
gen_scatter(8, i16)
|
||||
gen_scatter(8, i32)
|
||||
gen_scatter(8, i64)
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse.min.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse.max.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
@@ -31,7 +31,7 @@
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file stdlib-c.c
|
||||
/** @file builtins-c.c
|
||||
@brief Standard library function implementations written in C.
|
||||
|
||||
This file provides C implementations of various functions that can be
|
||||
@@ -51,6 +51,10 @@
|
||||
*/
|
||||
|
||||
|
||||
#ifndef _MSC_VER
|
||||
#include <unistd.h>
|
||||
#endif // !_MSC_VER
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
@@ -139,3 +143,28 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
|
||||
/** Returns the number of processors reported by the operating system.

    With MSVC the value comes from GetSystemInfo(); elsewhere it comes from
    sysconf(_SC_NPROCESSORS_ONLN) and is clamped so that the result is never
    less than 1.
 */
int __num_cores() {
#ifdef _MSC_VER
    // This is quite a hack.  Including all of windows.h to get this definition
    // pulls in a bunch of stuff that leads to undefined symbols at link time.
    // So we don't #include <windows.h> but instead have the equivalent declarations
    // here.  Presumably this struct declaration won't be changing in the future
    // anyway...
    struct SYSTEM_INFO {
        int pad0[2];
        void *pad1[2];
        int *pad2;
        int dwNumberOfProcessors;
        int pad3[3];
    };

    struct SYSTEM_INFO sysInfo;
    extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
    GetSystemInfo(&sysInfo);
    return sysInfo.dwNumberOfProcessors;
#else
    // sysconf() returns a long and reports failure as -1; never hand a
    // non-positive core count back to the caller.
    long onlineCount = sysconf(_SC_NPROCESSORS_ONLN);
    return (onlineCount < 1) ? 1 : (int)onlineCount;
#endif // !_MSC_VER
}
|
||||
@@ -31,12 +31,11 @@
|
||||
|
||||
;; This file declares implementations of various stdlib builtins that
|
||||
;; only require SSE version 1 and 2 functionality; this file, in turn
|
||||
;; is then included by stdlib-sse2.ll and stdlib-sse4.ll to provide
|
||||
;; is then included by builtins-sse2.ll and builtins-sse4.ll to provide
|
||||
;; those definitions for them.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
int8_16(4)
|
||||
int64minmax(4)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@@ -125,18 +124,19 @@ define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinlin
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fast math mode
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i32 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i32 *) nounwind
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define internal void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
call void @llvm.x86.sse.stmxcsr(i32 * %ptr)
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.sse.ldmxcsr(i32 * %ptr)
|
||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||
ret void
|
||||
}
|
||||
|
||||
@@ -376,33 +376,28 @@ define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
reduce_equal(4)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
define void @__masked_store_32(<4 x i32>* nocapture, <4 x i32>, <4 x i32>) nounwind alwaysinline {
|
||||
per_lane(4, <4 x i32> %2, `
|
||||
; compute address for this one
|
||||
%ptr_ID = getelementptr <4 x i32> * %0, i32 0, i32 LANE
|
||||
%storeval_ID = extractelement <4 x i32> %1, i32 LANE
|
||||
store i32 %storeval_ID, i32 * %ptr_ID')
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_64(<4 x i64>* nocapture, <4 x i64>, <4 x i32>) nounwind alwaysinline {
|
||||
per_lane(4, <4 x i32> %2, `
|
||||
%ptr_ID = getelementptr <4 x i64> * %0, i32 0, i32 LANE
|
||||
%storeval_ID = extractelement <4 x i64> %1, i32 LANE
|
||||
store i64 %storeval_ID, i64 * %ptr_ID')
|
||||
ret void
|
||||
}
|
||||
masked_store_blend_8_16_by_4()
|
||||
|
||||
gen_masked_store(4, i8, 8)
|
||||
gen_masked_store(4, i16, 16)
|
||||
gen_masked_store(4, i32, 32)
|
||||
gen_masked_store(4, i64, 64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(4, i8, 8)
|
||||
load_and_broadcast(4, i16, 16)
|
||||
load_and_broadcast(4, i32, 32)
|
||||
load_and_broadcast(4, i64, 64)
|
||||
|
||||
load_masked(4, i8, 8, 1)
|
||||
load_masked(4, i16, 16, 2)
|
||||
load_masked(4, i32, 32, 4)
|
||||
load_masked(4, i64, 64, 8)
|
||||
|
||||
@@ -411,7 +406,12 @@ load_masked(4, i64, 64, 8)
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather(4, i8)
|
||||
gen_gather(4, i16)
|
||||
gen_gather(4, i32)
|
||||
gen_gather(4, i64)
|
||||
|
||||
gen_scatter(4, i8)
|
||||
gen_scatter(4, i16)
|
||||
gen_scatter(4, i32)
|
||||
gen_scatter(4, i64)
|
||||
@@ -35,9 +35,10 @@
|
||||
; Define some basics for a 4-wide target
|
||||
stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
scans(4)
|
||||
|
||||
; Include the various definitions of things that only require SSE1 and SSE2
|
||||
include(`stdlib-sse.ll')
|
||||
include(`builtins-sse.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
@@ -276,41 +277,17 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
; FIXME: this is very inefficient, loops over all 32 bits...
|
||||
|
||||
; we could use the LLVM intrinsic declare i32 @llvm.ctpop.i32(i32),
|
||||
; although that currently ends up generating a POPCNT instruction even
|
||||
; if we give --target=sse2 on the command line. We probably need to
|
||||
; pipe through the 'sse2' request to LLVM via the 'features' string
|
||||
; at codegen time... (If e.g. --cpu=penryn is also passed along, then
|
||||
; it does generate non-POPCNT code and in particular better code than
|
||||
; the below does.)
|
||||
declare i32 @llvm.ctpop.i32(i32)
|
||||
declare i64 @llvm.ctpop.i64(i64)
|
||||
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
entry:
|
||||
br label %loop
|
||||
|
||||
loop:
|
||||
%count = phi i32 [ 0, %entry ], [ %newcount, %loop ]
|
||||
%val = phi i32 [ %0, %entry ], [ %newval, %loop ]
|
||||
%delta = and i32 %val, 1
|
||||
%newcount = add i32 %count, %delta
|
||||
%newval = lshr i32 %val, 1
|
||||
%done = icmp eq i32 %newval, 0
|
||||
br i1 %done, label %exit, label %loop
|
||||
|
||||
exit:
|
||||
ret i32 %newcount
|
||||
%val = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %val
|
||||
}
|
||||
|
||||
define internal i32 @__popcnt_int64(i64) nounwind readnone alwaysinline {
|
||||
%vec = bitcast i64 %0 to <2 x i32>
|
||||
%v0 = extractelement <2 x i32> %vec, i32 0
|
||||
%v1 = extractelement <2 x i32> %vec, i32 1
|
||||
%c0 = call i32 @__popcnt_int32(i32 %v0)
|
||||
%c1 = call i32 @__popcnt_int32(i32 %v1)
|
||||
%sum = add i32 %c0, %c1
|
||||
ret i32 %sum
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
|
||||
%val = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %val
|
||||
}
|
||||
|
||||
|
||||
@@ -35,9 +35,10 @@
|
||||
; Define common 4-wide stuff
|
||||
stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
scans(4)
|
||||
|
||||
; Define the stuff that can be done with base SSE1/SSE2 instructions
|
||||
include(`stdlib-sse.ll')
|
||||
include(`builtins-sse.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
@@ -76,7 +77,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
|
||||
}
|
||||
|
||||
define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundps imm8: round down (0b01) | don't signal precision exceptions (0b1000) = 0b1001 = 9
|
||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
@@ -84,14 +85,14 @@ define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonl
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundss imm8: round down (0b01) | don't signal precision exceptions (0b1000) = 0b1001 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundps imm8: round up (0b10) | don't signal precision exceptions (0b1000) = 0b1010 = 10
|
||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
@@ -99,7 +100,7 @@ define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundss imm8: round up (0b10) | don't signal precision exceptions (0b1000) = 0b1010 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
@@ -123,28 +124,28 @@ define internal double @__round_uniform_double(double) nounwind readonly alwaysi
|
||||
}
|
||||
|
||||
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundpd imm8: round down (0b01) | don't signal precision exceptions (0b1000) = 0b1001 = 9
|
||||
round2to4double(%0, 9)
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundsd imm8: round down (0b01) | don't signal precision exceptions (0b1000) = 0b1001 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundpd imm8: round up (0b10) | don't signal precision exceptions (0b1000) = 0b1010 = 10
|
||||
round2to4double(%0, 10)
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundsd imm8: round up (0b10) | don't signal precision exceptions (0b1000) = 0b1010 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
@@ -229,7 +230,6 @@ define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysi
|
||||
ret float %scalar
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
int8_16(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@@ -128,22 +128,22 @@ define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinlin
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fast math
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i32 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i32 *) nounwind
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define internal void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
call void @llvm.x86.sse.stmxcsr(i32 * %ptr)
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.sse.ldmxcsr(i32 * %ptr)
|
||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
@@ -435,44 +435,31 @@ define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
per_lane(8, <8 x i32> %2, `
|
||||
; compute address for this one
|
||||
%ptr_ID = getelementptr <8 x i32> * %0, i32 0, i32 LANE
|
||||
%storeval_ID = extractelement <8 x i32> %1, i32 LANE
|
||||
store i32 %storeval_ID, i32 * %ptr_ID')
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
per_lane(8, <8 x i32> %2, `
|
||||
; compute address for this one
|
||||
%ptr_ID = getelementptr <8 x i64> * %0, i32 0, i32 LANE
|
||||
%storeval_ID = extractelement <8 x i64> %1, i32 LANE
|
||||
store i64 %storeval_ID, i64 * %ptr_ID')
|
||||
ret void
|
||||
}
|
||||
|
||||
reduce_equal(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(8, i8, 8)
|
||||
load_and_broadcast(8, i16, 16)
|
||||
load_and_broadcast(8, i32, 32)
|
||||
load_and_broadcast(8, i64, 64)
|
||||
|
||||
load_masked(8, i8, 8, 1)
|
||||
load_masked(8, i16, 16, 2)
|
||||
load_masked(8, i32, 32, 4)
|
||||
load_masked(8, i64, 64, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
|
||||
gen_scatter(8, i8)
|
||||
gen_scatter(8, i16)
|
||||
gen_scatter(8, i32)
|
||||
gen_scatter(8, i64)
|
||||
|
||||
@@ -511,28 +498,28 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
|
||||
}
|
||||
|
||||
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round4to8(%0, 9)
|
||||
}
|
||||
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round4to8(%0, 10)
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
@@ -556,28 +543,28 @@ define internal double @__round_uniform_double(double) nounwind readonly alwaysi
|
||||
}
|
||||
|
||||
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round2to8double(%0, 9)
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round2to8double(%0, 10)
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
@@ -619,6 +606,13 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(8, i8, 8)
|
||||
gen_masked_store(8, i16, 16)
|
||||
gen_masked_store(8, i32, 32)
|
||||
gen_masked_store(8, i64, 64)
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
|
||||
<4 x float>) nounwind readnone
|
||||
|
||||
482
builtins.cpp
482
builtins.cpp
@@ -52,7 +52,10 @@
|
||||
#include <llvm/Type.h>
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Intrinsics.h>
|
||||
#include <llvm/Linker.h>
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/ADT/Triple.h>
|
||||
#include <llvm/Support/MemoryBuffer.h>
|
||||
#include <llvm/Bitcode/ReaderWriter.h>
|
||||
|
||||
@@ -67,7 +70,7 @@ extern yy_buffer_state *yy_scan_string(const char *);
|
||||
distinguish between signed and unsigned integers in its types.)
|
||||
|
||||
Because this function is only used for generating ispc declarations of
|
||||
functions defined in LLVM bitcode in the stdlib-*.ll files, in practice
|
||||
functions defined in LLVM bitcode in the builtins-*.ll files, in practice
|
||||
we can get enough of what we need for the relevant cases to make things
|
||||
work, partially with the help of the intAsUnsigned parameter, which
|
||||
indicates whether LLVM integer types should be treated as being signed
|
||||
@@ -78,8 +81,14 @@ static const Type *
|
||||
lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
if (t == LLVMTypes::VoidType)
|
||||
return AtomicType::Void;
|
||||
|
||||
// uniform
|
||||
else if (t == LLVMTypes::BoolType)
|
||||
return AtomicType::UniformBool;
|
||||
else if (t == LLVMTypes::Int8Type)
|
||||
return intAsUnsigned ? AtomicType::UniformUInt8 : AtomicType::UniformInt8;
|
||||
else if (t == LLVMTypes::Int16Type)
|
||||
return intAsUnsigned ? AtomicType::UniformUInt16 : AtomicType::UniformInt16;
|
||||
else if (t == LLVMTypes::Int32Type)
|
||||
return intAsUnsigned ? AtomicType::UniformUInt32 : AtomicType::UniformInt32;
|
||||
else if (t == LLVMTypes::FloatType)
|
||||
@@ -88,6 +97,12 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
return AtomicType::UniformDouble;
|
||||
else if (t == LLVMTypes::Int64Type)
|
||||
return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
|
||||
|
||||
// varying
|
||||
else if (t == LLVMTypes::Int8VectorType)
|
||||
return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
|
||||
else if (t == LLVMTypes::Int16VectorType)
|
||||
return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16;
|
||||
else if (t == LLVMTypes::Int32VectorType)
|
||||
return intAsUnsigned ? AtomicType::VaryingUInt32 : AtomicType::VaryingInt32;
|
||||
else if (t == LLVMTypes::FloatVectorType)
|
||||
@@ -96,6 +111,14 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
return AtomicType::VaryingDouble;
|
||||
else if (t == LLVMTypes::Int64VectorType)
|
||||
return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64;
|
||||
|
||||
// pointers to uniform
|
||||
else if (t == LLVMTypes::Int8PointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt8 :
|
||||
AtomicType::UniformInt8, false);
|
||||
else if (t == LLVMTypes::Int16PointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt16 :
|
||||
AtomicType::UniformInt16, false);
|
||||
else if (t == LLVMTypes::Int32PointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 :
|
||||
AtomicType::UniformInt32, false);
|
||||
@@ -106,6 +129,14 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
return new ReferenceType(AtomicType::UniformFloat, false);
|
||||
else if (t == LLVMTypes::DoublePointerType)
|
||||
return new ReferenceType(AtomicType::UniformDouble, false);
|
||||
|
||||
// pointers to varying
|
||||
else if (t == LLVMTypes::Int8VectorPointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt8 :
|
||||
AtomicType::VaryingInt8, false);
|
||||
else if (t == LLVMTypes::Int16VectorPointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt16 :
|
||||
AtomicType::VaryingInt16, false);
|
||||
else if (t == LLVMTypes::Int32VectorPointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
|
||||
AtomicType::VaryingInt32, false);
|
||||
@@ -116,6 +147,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
return new ReferenceType(AtomicType::VaryingFloat, false);
|
||||
else if (t == LLVMTypes::DoubleVectorPointerType)
|
||||
return new ReferenceType(AtomicType::VaryingDouble, false);
|
||||
|
||||
// arrays
|
||||
else if (llvm::isa<const llvm::PointerType>(t)) {
|
||||
const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);
|
||||
|
||||
@@ -139,6 +172,27 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
lCreateSymbol(const std::string &name, const Type *returnType,
|
||||
const std::vector<const Type *> &argTypes,
|
||||
const llvm::FunctionType *ftype, llvm::Function *func,
|
||||
SymbolTable *symbolTable) {
|
||||
SourcePos noPos;
|
||||
noPos.name = "__stdlib";
|
||||
|
||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||
// set NULL default arguments
|
||||
std::vector<ConstExpr *> defaults;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j)
|
||||
defaults.push_back(NULL);
|
||||
funcType->SetArgumentDefaults(defaults);
|
||||
|
||||
Symbol *sym = new Symbol(name, noPos, funcType);
|
||||
sym->function = func;
|
||||
symbolTable->AddFunction(sym);
|
||||
}
|
||||
|
||||
|
||||
/** Given an LLVM function declaration, synthesize the equivalent ispc
|
||||
symbol for the function (if possible). Returns true on success, false
|
||||
on failure.
|
||||
@@ -190,7 +244,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
|
||||
// Iterate over the arguments and try to find their equivalent ispc
|
||||
// types. Track if any of the arguments has an integer type.
|
||||
bool anyIntArgs = false;
|
||||
bool anyIntArgs = false, anyReferenceArgs = false;
|
||||
std::vector<const Type *> argTypes;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
|
||||
const llvm::Type *llvmArgType = ftype->getParamType(j);
|
||||
@@ -199,22 +253,26 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
return false;
|
||||
anyIntArgs |=
|
||||
(Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
|
||||
anyReferenceArgs |= (dynamic_cast<const ReferenceType *>(type) != NULL);
|
||||
argTypes.push_back(type);
|
||||
}
|
||||
|
||||
// Always create the symbol the first time through, in particular
|
||||
// so that we get symbols for things with no integer types!
|
||||
if (i == 0 || anyIntArgs == true) {
|
||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||
// set NULL default arguments
|
||||
std::vector<ConstExpr *> defaults;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j)
|
||||
defaults.push_back(NULL);
|
||||
funcType->SetArgumentDefaults(defaults);
|
||||
if (i == 0 || anyIntArgs == true)
|
||||
lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
|
||||
|
||||
Symbol *sym = new Symbol(name, noPos, funcType);
|
||||
sym->function = func;
|
||||
symbolTable->AddFunction(sym);
|
||||
// If there are any reference types, also make a variant of the
|
||||
// symbol that has them as const references. This obviously
|
||||
// doesn't make sense for many builtins, but we'll give the stdlib
|
||||
// the option to call one if it needs one.
|
||||
if (anyReferenceArgs == true) {
|
||||
for (unsigned int j = 0; j < argTypes.size(); ++j) {
|
||||
if (dynamic_cast<const ReferenceType *>(argTypes[j]) != NULL)
|
||||
argTypes[j] = argTypes[j]->GetAsConstType();
|
||||
lCreateSymbol(name + "_refsconst", returnType, argTypes,
|
||||
ftype, func, symbolTable);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -239,203 +297,32 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
}
|
||||
}
|
||||
|
||||
/** Declare the 'pseudo-gather' functions. When the ispc front-end needs
|
||||
to perform a gather, it generates a call to one of these functions,
|
||||
which have signatures:
|
||||
|
||||
varying int32 __pseudo_gather(varying int32 *, mask)
|
||||
varying int64 __pseudo_gather(varying int64 *, mask)
|
||||
|
||||
These functions are never actually implemented; the
|
||||
GatherScatterFlattenOpt optimization pass finds them and then converts
|
||||
them to make calls to the following functions, which represent gathers
|
||||
from a common base pointer with offsets. This approach allows the
|
||||
front-end to be relatively simple in how it emits address calculation
|
||||
for gathers.
|
||||
|
||||
varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base,
|
||||
int32 offsets, mask)
|
||||
varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base,
|
||||
int64 offsets, mask)
|
||||
|
||||
Then, the GSImprovementsPass optimization finds these and either
|
||||
converts them to native gather functions or converts them to vector
|
||||
loads, if equivalent.
|
||||
*/
|
||||
static void
|
||||
lDeclarePseudoGathers(llvm::Module *module) {
|
||||
SourcePos noPos;
|
||||
noPos.name = "__stdlib";
|
||||
|
||||
{
|
||||
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerVectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_gather_32", module);
|
||||
func->setOnlyReadsMemory(true);
|
||||
func->setDoesNotThrow(true);
|
||||
|
||||
fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false);
|
||||
func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_gather_64", module);
|
||||
func->setOnlyReadsMemory(true);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerType);
|
||||
argTypes.push_back(LLVMTypes::Int32VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_gather_base_offsets_32", module);
|
||||
func->setOnlyReadsMemory(true);
|
||||
func->setDoesNotThrow(true);
|
||||
|
||||
fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false);
|
||||
func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_gather_base_offsets_64", module);
|
||||
func->setOnlyReadsMemory(true);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Similarly to the 'pseudo-gathers' defined by lDeclarePseudoGathers(),
|
||||
we also declare (but never define) pseudo-scatter instructions with
|
||||
signatures:
|
||||
|
||||
void __pseudo_scatter_32(varying int32 *, varying int32 values, mask)
|
||||
void __pseudo_scatter_64(varying int64 *, varying int64 values, mask)
|
||||
|
||||
The GatherScatterFlattenOpt optimization pass also finds these and
|
||||
transforms them to scatters like:
|
||||
|
||||
void __pseudo_scatter_base_offsets_32(uniform int32 *base,
|
||||
varying int32 offsets, varying int32 values, mask)
|
||||
void __pseudo_scatter_base_offsets_64(uniform int64 *base,
|
||||
varying int32 offsets, varying int64 values, mask)
|
||||
|
||||
And the GSImprovementsPass in turn converts these to actual native
|
||||
scatters or masked stores.
|
||||
/** In many of the builtins-*.ll files, we have declarations of various LLVM
|
||||
intrinsics that are then used in the implementation of various target-
|
||||
specific functions. This function loops over all of the intrinsic
|
||||
declarations and makes sure that the signature we have in our .ll file
|
||||
matches the signature of the actual intrinsic.
|
||||
*/
|
||||
static void
|
||||
lDeclarePseudoScatters(llvm::Module *module) {
|
||||
SourcePos noPos;
|
||||
noPos.name = "__stdlib";
|
||||
lCheckModuleIntrinsics(llvm::Module *module) {
|
||||
llvm::Module::iterator iter;
|
||||
for (iter = module->begin(); iter != module->end(); ++iter) {
|
||||
llvm::Function *func = iter;
|
||||
if (!func->isIntrinsic())
|
||||
continue;
|
||||
|
||||
{
|
||||
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerVectorType);
|
||||
argTypes.push_back(LLVMTypes::Int32VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_scatter_32", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
{
|
||||
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerVectorType);
|
||||
argTypes.push_back(LLVMTypes::Int64VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_scatter_64", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerType);
|
||||
argTypes.push_back(LLVMTypes::Int32VectorType);
|
||||
argTypes.push_back(LLVMTypes::Int32VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_scatter_base_offsets_32", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
{
|
||||
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerType);
|
||||
argTypes.push_back(LLVMTypes::Int32VectorType);
|
||||
argTypes.push_back(LLVMTypes::Int64VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_scatter_base_offsets_64", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** This function declares placeholder masked store functions for the
|
||||
front-end to use.
|
||||
|
||||
void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask)
|
||||
void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask)
|
||||
|
||||
These in turn are converted to native masked stores or to regular
|
||||
stores (if the mask is all on) by the MaskedStoreOptPass optimization
|
||||
pass.
|
||||
*/
|
||||
static void
|
||||
lDeclarePseudoMaskedStore(llvm::Module *module) {
|
||||
SourcePos noPos;
|
||||
noPos.name = "__stdlib";
|
||||
|
||||
{
|
||||
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::Int32VectorPointerType);
|
||||
argTypes.push_back(LLVMTypes::Int32VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_masked_store_32", module);
|
||||
func->setDoesNotThrow(true);
|
||||
func->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
func->setDoesNotCapture(1, true);
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::Int64VectorPointerType);
|
||||
argTypes.push_back(LLVMTypes::Int64VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_masked_store_64", module);
|
||||
func->setDoesNotThrow(true);
|
||||
func->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
func->setDoesNotCapture(1, true);
|
||||
const std::string funcName = func->getName().str();
|
||||
// Work around http://llvm.org/bugs/show_bug.cgi?id=10438; only
|
||||
// check the llvm.x86.* intrinsics for now...
|
||||
if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
|
||||
llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
|
||||
assert(id != 0);
|
||||
LLVM_TYPE_CONST llvm::Type *intrinsicType =
|
||||
llvm::Intrinsic::getType(*g->ctx, id);
|
||||
intrinsicType = llvm::PointerType::get(intrinsicType, 0);
|
||||
assert(func->getType() == intrinsicType);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -459,10 +346,27 @@ lAddBitcode(const unsigned char *bitcode, int length,
|
||||
if (!bcModule)
|
||||
Error(SourcePos(), "Error parsing stdlib bitcode: %s", bcErr.c_str());
|
||||
else {
|
||||
// FIXME: this feels like a bad idea, but the issue is that when we
|
||||
// set the llvm::Module's target triple in the ispc Module::Module
|
||||
// constructor, we start by calling llvm::sys::getHostTriple() (and
|
||||
// then change the arch if needed). Somehow that ends up giving us
|
||||
// strings like 'x86_64-apple-darwin11.0.0', while the stuff we
|
||||
// compile to bitcode with clang has module triples like
|
||||
// 'i386-apple-macosx10.7.0'. And then LLVM issues a warning about
|
||||
// linking together modules with incompatible target triples..
|
||||
llvm::Triple mTriple(m->module->getTargetTriple());
|
||||
llvm::Triple bcTriple(bcModule->getTargetTriple());
|
||||
assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
|
||||
mTriple.getArch() == bcTriple.getArch());
|
||||
assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
|
||||
mTriple.getVendor() == bcTriple.getVendor());
|
||||
bcModule->setTargetTriple(mTriple.str());
|
||||
|
||||
std::string(linkError);
|
||||
if (llvm::Linker::LinkModules(module, bcModule, &linkError))
|
||||
Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
|
||||
lAddModuleSymbols(module, symbolTable);
|
||||
lCheckModuleIntrinsics(module);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -476,7 +380,7 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
||||
Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32);
|
||||
pw->isStatic = true;
|
||||
pw->constValue = new ConstExpr(pw->type, val, SourcePos());
|
||||
const llvm::Type *ltype = LLVMTypes::Int32Type;
|
||||
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
|
||||
llvm::Constant *linit = LLVMInt32(val);
|
||||
pw->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
|
||||
llvm::GlobalValue::InternalLinkage,
|
||||
@@ -485,6 +389,27 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void
|
||||
lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
||||
SymbolTable *symbolTable) {
|
||||
std::vector<const Type *> args;
|
||||
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
|
||||
Symbol *sym = new Symbol(name, SourcePos(), ft);
|
||||
sym->isStatic = true;
|
||||
|
||||
llvm::Function *func = module->getFunction(name);
|
||||
assert(func != NULL); // it should be declared already...
|
||||
func->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
|
||||
llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
|
||||
|
||||
sym->function = func;
|
||||
symbolTable->AddVariable(sym);
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void
|
||||
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
Symbol *pidx = new Symbol("programIndex", SourcePos(),
|
||||
@@ -496,7 +421,7 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
pi[i] = i;
|
||||
pidx->constValue = new ConstExpr(pidx->type, pi, SourcePos());
|
||||
|
||||
const llvm::Type *ltype = LLVMTypes::Int32VectorType;
|
||||
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32VectorType;
|
||||
llvm::Constant *linit = LLVMInt32Vector(pi);
|
||||
pidx->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
|
||||
llvm::GlobalValue::InternalLinkage, linit,
|
||||
@@ -508,32 +433,41 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
void
|
||||
DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
|
||||
bool includeStdlibISPC) {
|
||||
// Add the definitions from the compiled stdlib-c.c file
|
||||
extern unsigned char stdlib_bitcode_c[];
|
||||
extern int stdlib_bitcode_c_length;
|
||||
lAddBitcode(stdlib_bitcode_c, stdlib_bitcode_c_length, module, symbolTable);
|
||||
// Add the definitions from the compiled builtins-c.c file
|
||||
if (g->target.is32bit) {
|
||||
extern unsigned char builtins_bitcode_c_32[];
|
||||
extern int builtins_bitcode_c_32_length;
|
||||
lAddBitcode(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
|
||||
module, symbolTable);
|
||||
}
|
||||
else {
|
||||
extern unsigned char builtins_bitcode_c_64[];
|
||||
extern int builtins_bitcode_c_64_length;
|
||||
lAddBitcode(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
|
||||
module, symbolTable);
|
||||
}
|
||||
|
||||
// Next, add the target's custom implementations of the various needed
|
||||
// builtin functions (e.g. __masked_store_32(), etc).
|
||||
switch (g->target.isa) {
|
||||
case Target::SSE2:
|
||||
extern unsigned char stdlib_bitcode_sse2[];
|
||||
extern int stdlib_bitcode_sse2_length;
|
||||
lAddBitcode(stdlib_bitcode_sse2, stdlib_bitcode_sse2_length, module,
|
||||
extern unsigned char builtins_bitcode_sse2[];
|
||||
extern int builtins_bitcode_sse2_length;
|
||||
lAddBitcode(builtins_bitcode_sse2, builtins_bitcode_sse2_length, module,
|
||||
symbolTable);
|
||||
break;
|
||||
case Target::SSE4:
|
||||
extern unsigned char stdlib_bitcode_sse4[];
|
||||
extern int stdlib_bitcode_sse4_length;
|
||||
extern unsigned char stdlib_bitcode_sse4x2[];
|
||||
extern int stdlib_bitcode_sse4x2_length;
|
||||
extern unsigned char builtins_bitcode_sse4[];
|
||||
extern int builtins_bitcode_sse4_length;
|
||||
extern unsigned char builtins_bitcode_sse4x2[];
|
||||
extern int builtins_bitcode_sse4x2_length;
|
||||
switch (g->target.vectorWidth) {
|
||||
case 4:
|
||||
lAddBitcode(stdlib_bitcode_sse4, stdlib_bitcode_sse4_length,
|
||||
lAddBitcode(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 8:
|
||||
lAddBitcode(stdlib_bitcode_sse4x2, stdlib_bitcode_sse4x2_length,
|
||||
lAddBitcode(builtins_bitcode_sse4x2, builtins_bitcode_sse4x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
@@ -541,92 +475,27 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
}
|
||||
break;
|
||||
case Target::AVX:
|
||||
extern unsigned char stdlib_bitcode_avx[];
|
||||
extern int stdlib_bitcode_avx_length;
|
||||
lAddBitcode(stdlib_bitcode_avx, stdlib_bitcode_avx_length, module,
|
||||
symbolTable);
|
||||
switch (g->target.vectorWidth) {
|
||||
case 8:
|
||||
extern unsigned char builtins_bitcode_avx[];
|
||||
extern int builtins_bitcode_avx_length;
|
||||
lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module,
|
||||
symbolTable);
|
||||
break;
|
||||
case 16:
|
||||
extern unsigned char builtins_bitcode_avx_x2[];
|
||||
extern int builtins_bitcode_avx_x2_length;
|
||||
lAddBitcode(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error");
|
||||
}
|
||||
|
||||
// Add a declaration of void *ISPCMalloc(int64_t size, int alignment).
|
||||
// The user is responsible for linking in a definition of this if it's
|
||||
// needed by the compiled program.
|
||||
{ std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
|
||||
argTypes.push_back(llvm::Type::getInt64Ty(*ctx));
|
||||
argTypes.push_back(llvm::Type::getInt32Ty(*ctx));
|
||||
llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType,
|
||||
argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
|
||||
"ISPCMalloc", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
// Add a declaration of void ISPCFree(void *). The user is
|
||||
// responsible for linking in a definition of this if it's needed by
|
||||
// the compiled program.
|
||||
{ std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerType);
|
||||
llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType,
|
||||
argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
|
||||
"ISPCFree", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
// Add a declaration of void ISPCLaunch(void *funcPtr, void *data).
|
||||
// The user is responsible for linking in a definition of this if it's
|
||||
// needed by the compiled program.
|
||||
{ std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerType);
|
||||
argTypes.push_back(LLVMTypes::VoidPointerType);
|
||||
llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType,
|
||||
argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
|
||||
"ISPCLaunch", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
// Add a declaration of void ISPCSync(). The user is responsible for
|
||||
// linking in a definition of this if it's needed by the compiled
|
||||
// program.
|
||||
{
|
||||
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
|
||||
llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType,
|
||||
argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
|
||||
"ISPCSync", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
// Add a declaration of void ISPCInstrument(void *, void *, int, int).
|
||||
// The user is responsible for linking in a definition of this if it's
|
||||
// needed by the compiled program.
|
||||
{
|
||||
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
|
||||
argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0));
|
||||
argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0));
|
||||
argTypes.push_back(LLVMTypes::Int32Type);
|
||||
argTypes.push_back(LLVMTypes::Int32Type);
|
||||
llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType,
|
||||
argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
|
||||
"ISPCInstrument", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
// Declare various placeholder functions that the optimizer will later
|
||||
// find and replace with something more useful.
|
||||
lDeclarePseudoGathers(module);
|
||||
lDeclarePseudoScatters(module);
|
||||
lDeclarePseudoMaskedStore(module);
|
||||
|
||||
// define the 'programCount' builtin variable
|
||||
lDefineConstantInt("programCount", g->target.vectorWidth, module, symbolTable);
|
||||
|
||||
@@ -644,13 +513,20 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
symbolTable);
|
||||
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
|
||||
symbolTable);
|
||||
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
|
||||
symbolTable);
|
||||
|
||||
if (includeStdlibISPC) {
|
||||
// If the user wants the standard library to be included, parse the
|
||||
// serialized version of the stdlib.ispc file to get its definitions
|
||||
// added.
|
||||
// serialized version of the stdlib.ispc file to get its
|
||||
// definitions added. Disable emission of performance warnings for
|
||||
// now, since the user doesn't care about any of that in the stdlib
|
||||
// implementation...
|
||||
bool epf = g->emitPerfWarnings;
|
||||
g->emitPerfWarnings = false;
|
||||
extern char stdlib_code[];
|
||||
yy_scan_string(stdlib_code);
|
||||
yyparse();
|
||||
g->emitPerfWarnings = epf;
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
32
contrib/ispc.vim
Normal file
32
contrib/ispc.vim
Normal file
@@ -0,0 +1,32 @@
|
||||
" Vim syntax file
|
||||
" Language: ISPC
|
||||
" Maintainer: Andreas Wendleder <andreas.wendleder@gmail.com>
|
||||
" Last Change: 2011 Aug 3
|
||||
|
||||
" Quit when a syntax file was already loaded
|
||||
if exists("b:current_syntax")
|
||||
finish
|
||||
endif
|
||||
|
||||
" Read the C syntax to start with
|
||||
runtime! syntax/c.vim
|
||||
unlet b:current_syntax
|
||||
|
||||
" New keywords
|
||||
syn keyword ispcStatement cbreak ccontinue creturn launch print reference soa sync task
|
||||
syn keyword ispcConditional cif
|
||||
syn keyword ispcRepeat cdo cfor cwhile
|
||||
syn keyword ispcBuiltin programCount programIndex
|
||||
syn keyword ispcType export int8 int16 int32 int64
|
||||
|
||||
" Default highlighting
|
||||
command -nargs=+ HiLink hi def link <args>
|
||||
HiLink ispcStatement Statement
|
||||
HiLink ispcConditional Conditional
|
||||
HiLink ispcRepeat Repeat
|
||||
HiLink ispcBuiltin Statement
|
||||
HiLink ispcType Type
|
||||
delcommand HiLink
|
||||
|
||||
let b:current_syntax = "ispc"
|
||||
|
||||
363
ctx.cpp
363
ctx.cpp
@@ -144,16 +144,20 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
||||
returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
|
||||
StoreInst(LLVMMaskAllOff, returnedLanesPtr);
|
||||
|
||||
launchedTasks = false;
|
||||
launchGroupHandlePtr = AllocaInst(LLVMTypes::VoidPointerType, "launch_group_handle");
|
||||
StoreInst(llvm::Constant::getNullValue(LLVMTypes::VoidPointerType),
|
||||
launchGroupHandlePtr);
|
||||
|
||||
if (!returnType || returnType == AtomicType::Void)
|
||||
returnValuePtr = NULL;
|
||||
else {
|
||||
const llvm::Type *ftype = returnType->LLVMType(g->ctx);
|
||||
LLVM_TYPE_CONST llvm::Type *ftype = returnType->LLVMType(g->ctx);
|
||||
returnValuePtr = AllocaInst(ftype, "return_value_memory");
|
||||
// FIXME: don't do this store???
|
||||
StoreInst(llvm::Constant::getNullValue(ftype), returnValuePtr);
|
||||
}
|
||||
|
||||
#ifndef LLVM_2_8
|
||||
if (m->diBuilder) {
|
||||
/* If debugging is enabled, tell the debug information emission
|
||||
code about this new function */
|
||||
@@ -174,16 +178,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
||||
/* And start a scope representing the initial function scope */
|
||||
StartScope();
|
||||
}
|
||||
#endif // LLVM_2_8
|
||||
|
||||
launchedTasks = false;
|
||||
|
||||
// connect the function's mask memory to the __mask symbol
|
||||
Symbol *maskSymbol = m->symbolTable->LookupVariable("__mask");
|
||||
assert(maskSymbol != NULL);
|
||||
maskSymbol->storagePtr = maskPtr;
|
||||
|
||||
#ifndef LLVM_2_8
|
||||
// add debugging info for __mask, programIndex, ...
|
||||
if (m->diBuilder) {
|
||||
maskSymbol->pos = funcStartPos;
|
||||
@@ -208,15 +208,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
||||
true /* static */,
|
||||
programCountSymbol->storagePtr);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
FunctionEmitContext::~FunctionEmitContext() {
|
||||
assert(controlFlowInfo.size() == 0);
|
||||
#ifndef LLVM_2_8
|
||||
assert(debugScopes.size() == (m->diBuilder ? 1 : 0));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -704,6 +701,7 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
||||
#if 0
|
||||
// Compare the two masks to get a vector of i1s
|
||||
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
||||
v1, v2, "v1==v2");
|
||||
@@ -711,6 +709,12 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
||||
cmp = I1VecToBoolVec(cmp);
|
||||
// And see if it's all on
|
||||
return All(cmp);
|
||||
#else
|
||||
llvm::Value *mm1 = LaneMask(v1);
|
||||
llvm::Value *mm2 = LaneMask(v2);
|
||||
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
|
||||
"v1==v2");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -735,11 +739,12 @@ FunctionEmitContext::CreateBasicBlock(const char *name) {
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
|
||||
const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(b->getType());
|
||||
LLVM_TYPE_CONST llvm::ArrayType *at =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(b->getType());
|
||||
if (at) {
|
||||
// If we're given an array of vectors of i1s, then do the
|
||||
// conversion for each of the elements
|
||||
const llvm::Type *boolArrayType =
|
||||
LLVM_TYPE_CONST llvm::Type *boolArrayType =
|
||||
llvm::ArrayType::get(LLVMTypes::BoolVectorType, at->getNumElements());
|
||||
llvm::Value *ret = llvm::UndefValue::get(boolArrayType);
|
||||
|
||||
@@ -757,35 +762,24 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::EmitMalloc(const llvm::Type *ty, int align) {
|
||||
FunctionEmitContext::SizeOf(LLVM_TYPE_CONST llvm::Type *ty) {
|
||||
// Emit code to compute the size of the given type using a GEP with a
|
||||
// NULL base pointer, indexing one element of the given type, and
|
||||
// casting the resulting 'pointer' to an int giving its size.
|
||||
const llvm::Type *ptrType = llvm::PointerType::get(ty, 0);
|
||||
LLVM_TYPE_CONST llvm::Type *ptrType = llvm::PointerType::get(ty, 0);
|
||||
llvm::Value *nullPtr = llvm::Constant::getNullValue(ptrType);
|
||||
llvm::Value *index[1] = { LLVMInt32(1) };
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
|
||||
llvm::Value *poffset = llvm::GetElementPtrInst::Create(nullPtr, arrayRef,
|
||||
"offset_ptr", bblock);
|
||||
#else
|
||||
llvm::Value *poffset = llvm::GetElementPtrInst::Create(nullPtr, &index[0], &index[1],
|
||||
"offset_ptr", bblock);
|
||||
#endif
|
||||
AddDebugPos(poffset);
|
||||
llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int");
|
||||
|
||||
// And given the size, call the malloc function
|
||||
llvm::Function *fmalloc = m->module->getFunction("ISPCMalloc");
|
||||
assert(fmalloc != NULL);
|
||||
llvm::Value *mem = CallInst(fmalloc, sizeOf, LLVMInt32(align),
|
||||
"raw_argmem");
|
||||
// Cast the void * back to the result pointer type
|
||||
return BitCastInst(mem, ptrType, "mem_bitcast");
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::EmitFree(llvm::Value *ptr) {
|
||||
llvm::Value *freeArg = BitCastInst(ptr, LLVMTypes::VoidPointerType,
|
||||
"argmemfree");
|
||||
llvm::Function *ffree = m->module->getFunction("ISPCFree");
|
||||
assert(ffree != NULL);
|
||||
CallInst(ffree, freeArg);
|
||||
return sizeOf;
|
||||
}
|
||||
|
||||
|
||||
@@ -797,8 +791,13 @@ lGetStringAsValue(llvm::BasicBlock *bblock, const char *s) {
|
||||
llvm::GlobalValue::InternalLinkage,
|
||||
sConstant, s);
|
||||
llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(0) };
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
|
||||
return llvm::GetElementPtrInst::Create(sPtr, arrayRef, "sptr", bblock);
|
||||
#else
|
||||
return llvm::GetElementPtrInst::Create(sPtr, &indices[0], &indices[2],
|
||||
"sptr", bblock);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -838,7 +837,6 @@ FunctionEmitContext::GetDebugPos() const {
|
||||
void
|
||||
FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
|
||||
llvm::DIScope *scope) {
|
||||
#ifndef LLVM_2_8
|
||||
llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(value);
|
||||
if (inst != NULL && m->diBuilder) {
|
||||
SourcePos p = pos ? *pos : currentPos;
|
||||
@@ -849,13 +847,11 @@ FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
|
||||
inst->setDebugLoc(llvm::DebugLoc::get(p.first_line, p.first_column,
|
||||
scope ? *scope : GetDIScope()));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::StartScope() {
|
||||
#ifndef LLVM_2_8
|
||||
if (m->diBuilder != NULL) {
|
||||
llvm::DIScope parentScope;
|
||||
if (debugScopes.size() > 0)
|
||||
@@ -869,18 +865,15 @@ FunctionEmitContext::StartScope() {
|
||||
currentPos.first_column);
|
||||
debugScopes.push_back(lexicalBlock);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::EndScope() {
|
||||
#ifndef LLVM_2_8
|
||||
if (m->diBuilder != NULL) {
|
||||
assert(debugScopes.size() > 0);
|
||||
debugScopes.pop_back();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -893,7 +886,6 @@ FunctionEmitContext::GetDIScope() const {
|
||||
|
||||
void
|
||||
FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
|
||||
#ifndef LLVM_2_8
|
||||
if (m->diBuilder == NULL)
|
||||
return;
|
||||
|
||||
@@ -909,13 +901,11 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
|
||||
llvm::Instruction *declareInst =
|
||||
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
|
||||
AddDebugPos(declareInst, &sym->pos, &scope);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
|
||||
#ifndef LLVM_2_8
|
||||
if (m->diBuilder == NULL)
|
||||
return;
|
||||
|
||||
@@ -931,7 +921,6 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
|
||||
llvm::Instruction *declareInst =
|
||||
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
|
||||
AddDebugPos(declareInst, &sym->pos, &scope);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -941,15 +930,16 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
|
||||
Otherwise return zero.
|
||||
*/
|
||||
static int
|
||||
lArrayVectorWidth(const llvm::Type *t) {
|
||||
const llvm::ArrayType *arrayType = llvm::dyn_cast<const llvm::ArrayType>(t);
|
||||
lArrayVectorWidth(LLVM_TYPE_CONST llvm::Type *t) {
|
||||
LLVM_TYPE_CONST llvm::ArrayType *arrayType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(t);
|
||||
if (arrayType == NULL)
|
||||
return 0;
|
||||
|
||||
// We shouldn't be seeing arrays of anything but vectors being passed
|
||||
// to things like FunctionEmitContext::BinaryOperator() as operands
|
||||
const llvm::VectorType *vectorElementType =
|
||||
llvm::dyn_cast<const llvm::VectorType>(arrayType->getElementType());
|
||||
LLVM_TYPE_CONST llvm::VectorType *vectorElementType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType());
|
||||
assert(vectorElementType != NULL &&
|
||||
(int)vectorElementType->getNumElements() == g->target.vectorWidth);
|
||||
return (int)arrayType->getNumElements();
|
||||
@@ -966,7 +956,7 @@ FunctionEmitContext::BinaryOperator(llvm::Instruction::BinaryOps inst,
|
||||
}
|
||||
|
||||
assert(v0->getType() == v1->getType());
|
||||
const llvm::Type *type = v0->getType();
|
||||
LLVM_TYPE_CONST llvm::Type *type = v0->getType();
|
||||
int arraySize = lArrayVectorWidth(type);
|
||||
if (arraySize == 0) {
|
||||
llvm::Instruction *bop =
|
||||
@@ -1000,7 +990,7 @@ FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) {
|
||||
// Similarly to BinaryOperator, do the operation on all the elements of
|
||||
// the array if we're given an array type; otherwise just do the
|
||||
// regular llvm operation.
|
||||
const llvm::Type *type = v->getType();
|
||||
LLVM_TYPE_CONST llvm::Type *type = v->getType();
|
||||
int arraySize = lArrayVectorWidth(type);
|
||||
if (arraySize == 0) {
|
||||
llvm::Instruction *binst =
|
||||
@@ -1025,20 +1015,20 @@ FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) {
|
||||
// Given the llvm Type that represents an ispc VectorType, return an
|
||||
// equally-shaped type with boolean elements. (This is the type that will
|
||||
// be returned from CmpInst with ispc VectorTypes).
|
||||
static const llvm::Type *
|
||||
lGetMatchingBoolVectorType(const llvm::Type *type) {
|
||||
const llvm::ArrayType *arrayType =
|
||||
llvm::dyn_cast<const llvm::ArrayType>(type);
|
||||
static LLVM_TYPE_CONST llvm::Type *
|
||||
lGetMatchingBoolVectorType(LLVM_TYPE_CONST llvm::Type *type) {
|
||||
LLVM_TYPE_CONST llvm::ArrayType *arrayType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
|
||||
// should only be called for vector typed stuff...
|
||||
assert(arrayType != NULL);
|
||||
|
||||
const llvm::VectorType *vectorElementType =
|
||||
llvm::dyn_cast<const llvm::VectorType>(arrayType->getElementType());
|
||||
LLVM_TYPE_CONST llvm::VectorType *vectorElementType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType());
|
||||
assert(vectorElementType != NULL &&
|
||||
(int)vectorElementType->getNumElements() == g->target.vectorWidth);
|
||||
|
||||
const llvm::Type *base = llvm::VectorType::get(LLVMTypes::BoolType,
|
||||
g->target.vectorWidth);
|
||||
LLVM_TYPE_CONST llvm::Type *base =
|
||||
llvm::VectorType::get(LLVMTypes::BoolType, g->target.vectorWidth);
|
||||
return llvm::ArrayType::get(base, arrayType->getNumElements());
|
||||
}
|
||||
|
||||
@@ -1054,7 +1044,7 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
|
||||
}
|
||||
|
||||
assert(v0->getType() == v1->getType());
|
||||
const llvm::Type *type = v0->getType();
|
||||
LLVM_TYPE_CONST llvm::Type *type = v0->getType();
|
||||
int arraySize = lArrayVectorWidth(type);
|
||||
if (arraySize == 0) {
|
||||
llvm::Instruction *ci =
|
||||
@@ -1064,7 +1054,7 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
|
||||
return ci;
|
||||
}
|
||||
else {
|
||||
const llvm::Type *boolType = lGetMatchingBoolVectorType(type);
|
||||
LLVM_TYPE_CONST llvm::Type *boolType = lGetMatchingBoolVectorType(type);
|
||||
llvm::Value *ret = llvm::UndefValue::get(boolType);
|
||||
for (int i = 0; i < arraySize; ++i) {
|
||||
llvm::Value *a = ExtractInst(v0, i);
|
||||
@@ -1078,16 +1068,17 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::BitCastInst(llvm::Value *value, const llvm::Type *type,
|
||||
FunctionEmitContext::BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const llvm::Type *valType = value->getType();
|
||||
const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(valType);
|
||||
if (at && llvm::isa<const llvm::PointerType>(at->getElementType())) {
|
||||
LLVM_TYPE_CONST llvm::Type *valType = value->getType();
|
||||
LLVM_TYPE_CONST llvm::ArrayType *at =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
|
||||
if (at && llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType())) {
|
||||
// If we're bitcasting an array of pointers, we have a varying
|
||||
// lvalue; apply the corresponding bitcast to each of the
|
||||
// individual pointers and return the result array.
|
||||
@@ -1112,16 +1103,17 @@ FunctionEmitContext::BitCastInst(llvm::Value *value, const llvm::Type *type,
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::PtrToIntInst(llvm::Value *value, const llvm::Type *type,
|
||||
FunctionEmitContext::PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const llvm::Type *valType = value->getType();
|
||||
const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(valType);
|
||||
if (at && llvm::isa<const llvm::PointerType>(at->getElementType())) {
|
||||
LLVM_TYPE_CONST llvm::Type *valType = value->getType();
|
||||
LLVM_TYPE_CONST llvm::ArrayType *at =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
|
||||
if (at && llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType())) {
|
||||
// varying lvalue -> apply ptr to int to the individual pointers
|
||||
assert((int)at->getNumElements() == g->target.vectorWidth);
|
||||
|
||||
@@ -1144,16 +1136,17 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const llvm::Type *type,
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::IntToPtrInst(llvm::Value *value, const llvm::Type *type,
|
||||
FunctionEmitContext::IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const llvm::Type *valType = value->getType();
|
||||
const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(valType);
|
||||
if (at && llvm::isa<const llvm::PointerType>(at->getElementType())) {
|
||||
LLVM_TYPE_CONST llvm::Type *valType = value->getType();
|
||||
LLVM_TYPE_CONST llvm::ArrayType *at =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
|
||||
if (at && llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType())) {
|
||||
// varying lvalue -> apply int to ptr to the individual pointers
|
||||
assert((int)at->getNumElements() == g->target.vectorWidth);
|
||||
|
||||
@@ -1176,7 +1169,7 @@ FunctionEmitContext::IntToPtrInst(llvm::Value *value, const llvm::Type *type,
|
||||
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::TruncInst(llvm::Value *value, const llvm::Type *type,
|
||||
FunctionEmitContext::TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
@@ -1194,7 +1187,7 @@ FunctionEmitContext::TruncInst(llvm::Value *value, const llvm::Type *type,
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
|
||||
const llvm::Type *type, const char *name) {
|
||||
LLVM_TYPE_CONST llvm::Type *type, const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
return NULL;
|
||||
@@ -1210,7 +1203,7 @@ FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
|
||||
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::FPCastInst(llvm::Value *value, const llvm::Type *type,
|
||||
FunctionEmitContext::FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
@@ -1227,7 +1220,7 @@ FunctionEmitContext::FPCastInst(llvm::Value *value, const llvm::Type *type,
|
||||
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::SExtInst(llvm::Value *value, const llvm::Type *type,
|
||||
FunctionEmitContext::SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
@@ -1244,7 +1237,7 @@ FunctionEmitContext::SExtInst(llvm::Value *value, const llvm::Type *type,
|
||||
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::ZExtInst(llvm::Value *value, const llvm::Type *type,
|
||||
FunctionEmitContext::ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
@@ -1270,22 +1263,30 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0
|
||||
|
||||
// FIXME: do we need to handle the case of the first index being
|
||||
// varying? It's currently needed...
|
||||
assert(!llvm::isa<const llvm::VectorType>(index0->getType()));
|
||||
assert(!llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index0->getType()));
|
||||
|
||||
const llvm::Type *basePtrType = basePtr->getType();
|
||||
const llvm::ArrayType *baseArrayType =
|
||||
llvm::dyn_cast<const llvm::ArrayType>(basePtrType);
|
||||
LLVM_TYPE_CONST llvm::Type *basePtrType = basePtr->getType();
|
||||
LLVM_TYPE_CONST llvm::ArrayType *baseArrayType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(basePtrType);
|
||||
bool baseIsVaryingTypePointer = (baseArrayType != NULL) &&
|
||||
llvm::isa<const llvm::PointerType>(baseArrayType->getElementType());
|
||||
bool indexIsVaryingType = llvm::isa<const llvm::VectorType>(index1->getType());
|
||||
llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(baseArrayType->getElementType());
|
||||
bool indexIsVaryingType =
|
||||
llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index1->getType());
|
||||
|
||||
if (!indexIsVaryingType && !baseIsVaryingTypePointer) {
|
||||
// The easy case: both the base pointer and the indices are
|
||||
// uniform, so just emit the regular LLVM GEP instruction
|
||||
llvm::Value *indices[2] = { index0, index1 };
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
|
||||
llvm::Instruction *inst =
|
||||
llvm::GetElementPtrInst::Create(basePtr, arrayRef,
|
||||
name ? name : "gep", bblock);
|
||||
#else
|
||||
llvm::Instruction *inst =
|
||||
llvm::GetElementPtrInst::Create(basePtr, &indices[0], &indices[2],
|
||||
name ? name : "gep", bblock);
|
||||
#endif
|
||||
AddDebugPos(inst);
|
||||
return inst;
|
||||
}
|
||||
@@ -1316,9 +1317,10 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0
|
||||
// This is kind of a hack: use the type from the GEP to
|
||||
// figure out the return type and the first time through,
|
||||
// create an undef value of that type here
|
||||
const llvm::PointerType *elementPtrType =
|
||||
llvm::dyn_cast<const llvm::PointerType>(eltPtr->getType());
|
||||
const llvm::Type *elementType = elementPtrType->getElementType();
|
||||
LLVM_TYPE_CONST llvm::PointerType *elementPtrType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(eltPtr->getType());
|
||||
LLVM_TYPE_CONST llvm::Type *elementType =
|
||||
elementPtrType->getElementType();
|
||||
lret = llvm::UndefValue::get(LLVMPointerVectorType(elementType));
|
||||
}
|
||||
|
||||
@@ -1345,7 +1347,7 @@ FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (llvm::isa<const llvm::PointerType>(lvalue->getType())) {
|
||||
if (llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(lvalue->getType())) {
|
||||
// If the lvalue is a straight up regular pointer, then just issue
|
||||
// a regular load. First figure out the alignment; in general we
|
||||
// can just assume the natural alignment (0 here), but for varying
|
||||
@@ -1372,7 +1374,7 @@ FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type,
|
||||
// information we need from the LLVM::Type, so have to carry the
|
||||
// ispc type in through this path..
|
||||
assert(type != NULL);
|
||||
assert(llvm::isa<const llvm::ArrayType>(lvalue->getType()));
|
||||
assert(llvm::isa<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));
|
||||
return gather(lvalue, type, name);
|
||||
}
|
||||
}
|
||||
@@ -1382,9 +1384,9 @@ llvm::Value *
|
||||
FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
|
||||
const char *name) {
|
||||
// We should have a varying lvalue if we get here...
|
||||
assert(llvm::dyn_cast<const llvm::ArrayType>(lvalue->getType()));
|
||||
assert(llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));
|
||||
|
||||
const llvm::Type *retType = type->LLVMType(g->ctx);
|
||||
LLVM_TYPE_CONST llvm::Type *retType = type->LLVMType(g->ctx);
|
||||
|
||||
const StructType *st = dynamic_cast<const StructType *>(type);
|
||||
if (st) {
|
||||
@@ -1410,7 +1412,7 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
|
||||
// the GEP stuff in the loop below ends up computing pointers based
|
||||
// on elements in the vectors rather than incorrectly advancing to
|
||||
// the next vector...
|
||||
const llvm::Type *eltType =
|
||||
LLVM_TYPE_CONST llvm::Type *eltType =
|
||||
vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx);
|
||||
lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0));
|
||||
|
||||
@@ -1441,17 +1443,20 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
|
||||
llvm::Value *mask = GetMask();
|
||||
llvm::Function *gather = NULL;
|
||||
// Figure out which gather function to call based on the size of
|
||||
// the elements; will need to generalize this for 8 and 16-bit
|
||||
// types.
|
||||
// the elements.
|
||||
if (retType == LLVMTypes::DoubleVectorType ||
|
||||
retType == LLVMTypes::Int64VectorType)
|
||||
gather = m->module->getFunction("__pseudo_gather_64");
|
||||
else {
|
||||
assert(retType == LLVMTypes::FloatVectorType ||
|
||||
retType == LLVMTypes::Int32VectorType);
|
||||
else if (retType == LLVMTypes::FloatVectorType ||
|
||||
retType == LLVMTypes::Int32VectorType)
|
||||
gather = m->module->getFunction("__pseudo_gather_32");
|
||||
else if (retType == LLVMTypes::Int16VectorType)
|
||||
gather = m->module->getFunction("__pseudo_gather_16");
|
||||
else {
|
||||
assert(retType == LLVMTypes::Int8VectorType);
|
||||
gather = m->module->getFunction("__pseudo_gather_8");
|
||||
}
|
||||
assert(gather);
|
||||
assert(gather != NULL);
|
||||
|
||||
llvm::Value *voidlvalue = BitCastInst(lvalue, LLVMTypes::VoidPointerType);
|
||||
llvm::Instruction *call = CallInst(gather, voidlvalue, mask, name);
|
||||
@@ -1473,33 +1478,21 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
|
||||
void
|
||||
FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {
|
||||
llvm::Value *str = llvm::MDString::get(*g->ctx, pos.name);
|
||||
#ifdef LLVM_2_8
|
||||
llvm::MDNode *md = llvm::MDNode::get(*g->ctx, &str, 1);
|
||||
#else
|
||||
llvm::MDNode *md = llvm::MDNode::get(*g->ctx, str);
|
||||
#endif
|
||||
inst->setMetadata("filename", md);
|
||||
|
||||
llvm::Value *line = LLVMInt32(pos.first_line);
|
||||
#ifdef LLVM_2_8
|
||||
md = llvm::MDNode::get(*g->ctx, &line, 1);
|
||||
#else
|
||||
md = llvm::MDNode::get(*g->ctx, line);
|
||||
#endif
|
||||
inst->setMetadata("line", md);
|
||||
|
||||
llvm::Value *column = LLVMInt32(pos.first_column);
|
||||
#ifdef LLVM_2_8
|
||||
md = llvm::MDNode::get(*g->ctx, &column, 1);
|
||||
#else
|
||||
md = llvm::MDNode::get(*g->ctx, column);
|
||||
#endif
|
||||
inst->setMetadata("column", md);
|
||||
}
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::AllocaInst(const llvm::Type *llvmType, const char *name,
|
||||
FunctionEmitContext::AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name,
|
||||
int align, bool atEntryBlock) {
|
||||
llvm::AllocaInst *inst = NULL;
|
||||
if (atEntryBlock) {
|
||||
@@ -1519,9 +1512,10 @@ FunctionEmitContext::AllocaInst(const llvm::Type *llvmType, const char *name,
|
||||
// unlikely that this array will be loaded into varying variables with
|
||||
// what will be aligned accesses if the uniform -> varying load is done
|
||||
// in regular chunks.
|
||||
const llvm::ArrayType *arrayType = llvm::dyn_cast<const llvm::ArrayType>(llvmType);
|
||||
LLVM_TYPE_CONST llvm::ArrayType *arrayType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(llvmType);
|
||||
if (align == 0 && arrayType != NULL &&
|
||||
!llvm::isa<const llvm::VectorType>(arrayType->getElementType()))
|
||||
!llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType()))
|
||||
align = 4 * g->target.nativeVectorWidth;
|
||||
|
||||
if (align != 0)
|
||||
@@ -1546,7 +1540,7 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
return;
|
||||
}
|
||||
|
||||
assert(llvm::isa<const llvm::PointerType>(lvalue->getType()));
|
||||
assert(llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(lvalue->getType()));
|
||||
|
||||
const CollectionType *collectionType =
|
||||
dynamic_cast<const CollectionType *>(rvalueType);
|
||||
@@ -1570,9 +1564,7 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
rvalueType = rvalueType->GetAsNonConstType();
|
||||
|
||||
llvm::Function *maskedStoreFunc = NULL;
|
||||
// Figure out if we need a 32-bit or 64-bit masked store. This
|
||||
// will need to be generalized when/if 8 and 16-bit data types are
|
||||
// added.
|
||||
// Figure out if we need a 8, 16, 32 or 64-bit masked store.
|
||||
if (rvalueType == AtomicType::VaryingDouble ||
|
||||
rvalueType == AtomicType::VaryingInt64 ||
|
||||
rvalueType == AtomicType::VaryingUInt64) {
|
||||
@@ -1582,13 +1574,11 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType,
|
||||
"rvalue_to_int64");
|
||||
}
|
||||
else {
|
||||
assert(rvalueType == AtomicType::VaryingFloat ||
|
||||
rvalueType == AtomicType::VaryingBool ||
|
||||
rvalueType == AtomicType::VaryingInt32 ||
|
||||
rvalueType == AtomicType::VaryingUInt32 ||
|
||||
dynamic_cast<const EnumType *>(rvalueType) != NULL);
|
||||
|
||||
else if (rvalueType == AtomicType::VaryingFloat ||
|
||||
rvalueType == AtomicType::VaryingBool ||
|
||||
rvalueType == AtomicType::VaryingInt32 ||
|
||||
rvalueType == AtomicType::VaryingUInt32 ||
|
||||
dynamic_cast<const EnumType *>(rvalueType) != NULL) {
|
||||
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32");
|
||||
lvalue = BitCastInst(lvalue, LLVMTypes::Int32VectorPointerType,
|
||||
"lvalue_to_int32vecptr");
|
||||
@@ -1596,6 +1586,18 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType,
|
||||
"rvalue_to_int32");
|
||||
}
|
||||
else if (rvalueType == AtomicType::VaryingInt16 ||
|
||||
rvalueType == AtomicType::VaryingUInt16) {
|
||||
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_16");
|
||||
lvalue = BitCastInst(lvalue, LLVMTypes::Int16VectorPointerType,
|
||||
"lvalue_to_int16vecptr");
|
||||
}
|
||||
else if (rvalueType == AtomicType::VaryingInt8 ||
|
||||
rvalueType == AtomicType::VaryingUInt8) {
|
||||
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_8");
|
||||
lvalue = BitCastInst(lvalue, LLVMTypes::Int8VectorPointerType,
|
||||
"lvalue_to_int8vecptr");
|
||||
}
|
||||
|
||||
std::vector<llvm::Value *> args;
|
||||
args.push_back(lvalue);
|
||||
@@ -1616,7 +1618,7 @@ void
|
||||
FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
llvm::Value *storeMask, const Type *rvalueType) {
|
||||
assert(rvalueType->IsVaryingType());
|
||||
assert(llvm::isa<const llvm::ArrayType>(lvalue->getType()));
|
||||
assert(llvm::isa<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));
|
||||
|
||||
const StructType *structType = dynamic_cast<const StructType *>(rvalueType);
|
||||
if (structType) {
|
||||
@@ -1635,7 +1637,8 @@ FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
// the GEP stuff in the loop below ends up computing pointers based
|
||||
// on elements in the vectors rather than incorrectly advancing to
|
||||
// the next vector...
|
||||
const llvm::Type *eltType = vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx);
|
||||
LLVM_TYPE_CONST llvm::Type *eltType =
|
||||
vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx);
|
||||
lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0));
|
||||
|
||||
for (int i = 0; i < vt->GetElementCount(); ++i) {
|
||||
@@ -1653,20 +1656,21 @@ FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
assert(dynamic_cast<const AtomicType *>(rvalueType) != NULL);
|
||||
|
||||
llvm::Function *func = NULL;
|
||||
const llvm::Type *type = rvalue->getType();
|
||||
LLVM_TYPE_CONST llvm::Type *type = rvalue->getType();
|
||||
if (type == LLVMTypes::DoubleVectorType ||
|
||||
type == LLVMTypes::Int64VectorType) {
|
||||
func = m->module->getFunction("__pseudo_scatter_64");
|
||||
rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, "rvalue2int");
|
||||
}
|
||||
else {
|
||||
// FIXME: if this hits, presumably it's due to needing int8 and/or
|
||||
// int16 versions of scatter...
|
||||
assert(type == LLVMTypes::FloatVectorType ||
|
||||
type == LLVMTypes::Int32VectorType);
|
||||
else if (type == LLVMTypes::FloatVectorType ||
|
||||
type == LLVMTypes::Int32VectorType) {
|
||||
func = m->module->getFunction("__pseudo_scatter_32");
|
||||
rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, "rvalue2int");
|
||||
}
|
||||
else if (type == LLVMTypes::Int16VectorType)
|
||||
func = m->module->getFunction("__pseudo_scatter_16");
|
||||
else if (type == LLVMTypes::Int8VectorType)
|
||||
func = m->module->getFunction("__pseudo_scatter_8");
|
||||
assert(func != NULL);
|
||||
|
||||
AddInstrumentationPoint("scatter");
|
||||
@@ -1720,7 +1724,7 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, bblock);
|
||||
AddDebugPos(si);
|
||||
}
|
||||
else if (llvm::isa<const llvm::ArrayType>(lvalue->getType()))
|
||||
else if (llvm::isa<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()))
|
||||
// We have a varying lvalue (an array of pointers), so it's time to
|
||||
// scatter
|
||||
scatter(rvalue, lvalue, storeMask, rvalueType);
|
||||
@@ -1764,7 +1768,7 @@ FunctionEmitContext::ExtractInst(llvm::Value *v, int elt, const char *name) {
|
||||
}
|
||||
|
||||
llvm::Instruction *ei = NULL;
|
||||
if (llvm::isa<const llvm::VectorType>(v->getType()))
|
||||
if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(v->getType()))
|
||||
ei = llvm::ExtractElementInst::Create(v, LLVMInt32(elt),
|
||||
name ? name : "extract", bblock);
|
||||
else
|
||||
@@ -1784,7 +1788,7 @@ FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
|
||||
}
|
||||
|
||||
llvm::Instruction *ii = NULL;
|
||||
if (llvm::isa<const llvm::VectorType>(v->getType()))
|
||||
if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(v->getType()))
|
||||
ii = llvm::InsertElementInst::Create(v, eltVal, LLVMInt32(elt),
|
||||
name ? name : "insert", bblock);
|
||||
else
|
||||
@@ -1796,12 +1800,12 @@ FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
|
||||
|
||||
|
||||
llvm::PHINode *
|
||||
FunctionEmitContext::PhiNode(const llvm::Type *type, int count,
|
||||
FunctionEmitContext::PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
|
||||
const char *name) {
|
||||
llvm::PHINode *pn = llvm::PHINode::Create(type,
|
||||
#if !defined(LLVM_2_8) && !defined(LLVM_2_9)
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
count,
|
||||
#endif // !LLVM_2_8 && !LLVM_2_9
|
||||
#endif // LLVM_3_0
|
||||
name ? name : "phi", bblock);
|
||||
AddDebugPos(pn);
|
||||
return pn;
|
||||
@@ -1833,9 +1837,14 @@ FunctionEmitContext::CallInst(llvm::Function *func,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::Instruction *ci =
|
||||
llvm::CallInst::Create(func, args, name ? name : "", bblock);
|
||||
#else
|
||||
llvm::Instruction *ci =
|
||||
llvm::CallInst::Create(func, args.begin(), args.end(),
|
||||
name ? name : "", bblock);
|
||||
#endif
|
||||
AddDebugPos(ci);
|
||||
return ci;
|
||||
}
|
||||
@@ -1849,10 +1858,15 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::Instruction *ci =
|
||||
llvm::CallInst::Create(func, arg, name ? name : "", bblock);
|
||||
#else
|
||||
llvm::Value *args[] = { arg };
|
||||
llvm::Instruction *ci =
|
||||
llvm::CallInst::Create(func, &args[0], &args[1], name ? name : "",
|
||||
bblock);
|
||||
#endif
|
||||
AddDebugPos(ci);
|
||||
return ci;
|
||||
}
|
||||
@@ -1867,9 +1881,16 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0,
|
||||
}
|
||||
|
||||
llvm::Value *args[] = { arg0, arg1 };
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::ArrayRef<llvm::Value *> argArrayRef(&args[0], &args[2]);
|
||||
llvm::Instruction *ci =
|
||||
llvm::CallInst::Create(func, argArrayRef, name ? name : "",
|
||||
bblock);
|
||||
#else
|
||||
llvm::Instruction *ci =
|
||||
llvm::CallInst::Create(func, &args[0], &args[2], name ? name : "",
|
||||
bblock);
|
||||
#endif
|
||||
AddDebugPos(ci);
|
||||
return ci;
|
||||
}
|
||||
@@ -1877,15 +1898,9 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0,
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::ReturnInst() {
|
||||
if (launchedTasks) {
|
||||
// Automatically add a sync call at the end of any function that
|
||||
// launched tasks
|
||||
SourcePos noPos;
|
||||
noPos.name = "__auto_sync";
|
||||
ExprStmt *es = new ExprStmt(new SyncExpr(noPos), noPos);
|
||||
es->EmitCode(this);
|
||||
delete es;
|
||||
}
|
||||
if (launchedTasks)
|
||||
// Add a sync call at the end of any function that launched tasks
|
||||
SyncInst();
|
||||
|
||||
llvm::Instruction *rinst = NULL;
|
||||
if (returnValuePtr != NULL) {
|
||||
@@ -1908,7 +1923,8 @@ FunctionEmitContext::ReturnInst() {
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
||||
std::vector<llvm::Value *> &argVals) {
|
||||
std::vector<llvm::Value *> &argVals,
|
||||
llvm::Value *launchCount) {
|
||||
if (callee == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
return NULL;
|
||||
@@ -1916,28 +1932,24 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
||||
|
||||
launchedTasks = true;
|
||||
|
||||
const llvm::Type *argType = callee->arg_begin()->getType();
|
||||
LLVM_TYPE_CONST llvm::Type *argType = callee->arg_begin()->getType();
|
||||
assert(llvm::PointerType::classof(argType));
|
||||
const llvm::PointerType *pt = static_cast<const llvm::PointerType *>(argType);
|
||||
LLVM_TYPE_CONST llvm::PointerType *pt =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(argType);
|
||||
assert(llvm::StructType::classof(pt->getElementType()));
|
||||
const llvm::StructType *argStructType =
|
||||
static_cast<const llvm::StructType *>(pt->getElementType());
|
||||
LLVM_TYPE_CONST llvm::StructType *argStructType =
|
||||
static_cast<LLVM_TYPE_CONST llvm::StructType *>(pt->getElementType());
|
||||
assert(argStructType->getNumElements() == argVals.size() + 1);
|
||||
|
||||
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
|
||||
assert(falloc != NULL);
|
||||
int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
// Use malloc() to allocate storage on Windows, since the stack is
|
||||
// generally not big enough there to do enough allocations for lots of
|
||||
// tasks and then things crash horribly...
|
||||
llvm::Value *argmem = EmitMalloc(argStructType, align);
|
||||
#else
|
||||
// Use alloca for space for the task args on OSX And Linux. KEY
|
||||
// DETAIL: pass false to the call of FunctionEmitContext::AllocaInst so
|
||||
// that the alloca doesn't happen just once at the top of the function,
|
||||
// but happens each time the enclosing basic block executes.
|
||||
llvm::Value *argmem = AllocaInst(argStructType, "argmem", align, false);
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType);
|
||||
std::vector<llvm::Value *> allocArgs;
|
||||
allocArgs.push_back(launchGroupHandlePtr);
|
||||
allocArgs.push_back(SizeOf(argStructType));
|
||||
allocArgs.push_back(LLVMInt32(align));
|
||||
llvm::Value *voidmem = CallInst(falloc, allocArgs, "args_ptr");
|
||||
llvm::Value *argmem = BitCastInst(voidmem, pt);
|
||||
|
||||
// Copy the values of the parameters into the appropriate place in
|
||||
// the argument block
|
||||
@@ -1959,5 +1971,32 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
||||
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
|
||||
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
|
||||
assert(flaunch != NULL);
|
||||
return CallInst(flaunch, fptr, voidmem, "");
|
||||
std::vector<llvm::Value *> args;
|
||||
args.push_back(launchGroupHandlePtr);
|
||||
args.push_back(fptr);
|
||||
args.push_back(voidmem);
|
||||
args.push_back(launchCount);
|
||||
return CallInst(flaunch, args, "");
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::SyncInst() {
|
||||
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr, NULL);
|
||||
llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
|
||||
llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp,
|
||||
llvm::CmpInst::ICMP_NE,
|
||||
launchGroupHandle, nullPtrValue);
|
||||
llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
|
||||
llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
|
||||
BranchInst(bSync, bPostSync, nonNull);
|
||||
|
||||
SetCurrentBasicBlock(bSync);
|
||||
llvm::Function *fsync = m->module->getFunction("ISPCSync");
|
||||
if (fsync == NULL)
|
||||
FATAL("Couldn't find ISPCSync declaration?!");
|
||||
CallInst(fsync, launchGroupHandle, "");
|
||||
BranchInst(bPostSync);
|
||||
|
||||
SetCurrentBasicBlock(bPostSync);
|
||||
}
|
||||
|
||||
42
ctx.h
42
ctx.h
@@ -210,15 +210,8 @@ public:
|
||||
i32. */
|
||||
llvm::Value *I1VecToBoolVec(llvm::Value *b);
|
||||
|
||||
/** Emit code to call the user-supplied ISPCMalloc function to
|
||||
allocate space for an object of thee given type. Returns the
|
||||
pointer value returned by the ISPCMalloc call. */
|
||||
llvm::Value *EmitMalloc(const llvm::Type *ty, int align = 0);
|
||||
|
||||
/** Emit code to call the user-supplied ISPCFree function, passing it
|
||||
the given pointer to storage previously allocated by an
|
||||
EmitMalloc() call. */
|
||||
void EmitFree(llvm::Value *ptr);
|
||||
/** Returns the size of the given type. */
|
||||
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *ty);
|
||||
|
||||
/** If the user has asked to compile the program with instrumentation,
|
||||
this inserts a callback to the user-supplied instrumentation
|
||||
@@ -303,21 +296,21 @@ public:
|
||||
llvm::CmpInst::Predicate pred,
|
||||
llvm::Value *v0, llvm::Value *v1, const char *name = NULL);
|
||||
|
||||
llvm::Value *BitCastInst(llvm::Value *value, const llvm::Type *type,
|
||||
llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, const llvm::Type *type,
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Value *IntToPtrInst(llvm::Value *value, const llvm::Type *type,
|
||||
llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *TruncInst(llvm::Value *value, const llvm::Type *type,
|
||||
llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
|
||||
const llvm::Type *type, const char *name = NULL);
|
||||
llvm::Instruction *FPCastInst(llvm::Value *value, const llvm::Type *type,
|
||||
LLVM_TYPE_CONST llvm::Type *type, const char *name = NULL);
|
||||
llvm::Instruction *FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *SExtInst(llvm::Value *value, const llvm::Type *type,
|
||||
llvm::Instruction *SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *ZExtInst(llvm::Value *value, const llvm::Type *type,
|
||||
llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
|
||||
/** This GEP method is a generalization of the standard one in LLVM; it
|
||||
@@ -347,7 +340,7 @@ public:
|
||||
instruction is added at the start of the function in the entry
|
||||
basic block; if it should be added to the current basic block, then
|
||||
the atEntryBlock parameter should be false. */
|
||||
llvm::Value *AllocaInst(const llvm::Type *llvmType, const char *name = NULL,
|
||||
llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name = NULL,
|
||||
int align = 0, bool atEntryBlock = true);
|
||||
|
||||
/** Standard store instruction; for this variant, the lvalue must be a
|
||||
@@ -378,7 +371,8 @@ public:
|
||||
llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
|
||||
const char *name = NULL);
|
||||
|
||||
llvm::PHINode *PhiNode(const llvm::Type *type, int count, const char *name = NULL);
|
||||
llvm::PHINode *PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
|
||||
llvm::Value *val1, const char *name = NULL);
|
||||
|
||||
@@ -398,7 +392,10 @@ public:
|
||||
/** Launch an asynchronous task to run the given function, passing it
|
||||
the given argument values. */
|
||||
llvm::Instruction *LaunchInst(llvm::Function *callee,
|
||||
std::vector<llvm::Value *> &argVals);
|
||||
std::vector<llvm::Value *> &argVals,
|
||||
llvm::Value *launchCount);
|
||||
|
||||
void SyncInst();
|
||||
|
||||
llvm::Instruction *ReturnInst();
|
||||
/** @} */
|
||||
@@ -488,6 +485,11 @@ private:
|
||||
/** True if a 'launch' statement has been encountered in the function. */
|
||||
bool launchedTasks;
|
||||
|
||||
/** This is a pointer to a void * that is passed to the ISPCLaunch(),
|
||||
ISPCAlloc(), and ISPCSync() routines as a handle to the group of
|
||||
tasks launched from the current function. */
|
||||
llvm::Value *launchGroupHandlePtr;
|
||||
|
||||
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
|
||||
static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
|
||||
bool ifsInLoopAllUniform() const;
|
||||
|
||||
2
decl.cpp
2
decl.cpp
@@ -237,7 +237,7 @@ Declarator::GetType(DeclSpecs *ds) const {
|
||||
sprintf(buf, "__anon_parameter_%d", i);
|
||||
sym = new Symbol(buf, pos);
|
||||
Declarator *declarator = new Declarator(sym, sym->pos);
|
||||
sym->type = declarator->GetType(ds);
|
||||
sym->type = declarator->GetType(d->declSpecs);
|
||||
d->declarators.push_back(declarator);
|
||||
}
|
||||
else {
|
||||
|
||||
@@ -1,3 +1,138 @@
|
||||
=== v1.0.10 === (30 September 2011)
|
||||
|
||||
This release features an extensive new example showing the application of
|
||||
ispc to a deferred shading algorithm for scenes with thousands of lights
|
||||
(examples/deferred). This is an implementation of the algorithm that Johan
|
||||
Andersson described at SIGGRAPH 2009 and was implemented by Andrew
|
||||
Lauritzen and Jefferson Montgomery. The basic idea is that a pre-rendered
|
||||
G-buffer is partitioned into tiles, and in each tile, the set of lights
|
||||
that contribute to the tile is computed. Then, the pixels in the tile are
|
||||
shaded using those light sources. (See slides 19-29 of
|
||||
http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
|
||||
for more details on the algorithm.)
|
||||
|
||||
The mechanism for launching tasks from ispc code has been generalized to
|
||||
allow multiple tasks to be launched with a single launch call (see
|
||||
http://ispc.github.com/ispc.html#task-parallelism-language-syntax for more
|
||||
information.)
|
||||
|
||||
A few new functions have been added to the standard library: num_cores()
|
||||
returns the number of cores in the system's CPU, and variants of all of the
|
||||
atomic operators that take 'uniform' values as parameters have been added.
|
||||
|
||||
=== v1.0.9 === (26 September 2011)
|
||||
|
||||
The binary release of v1.0.9 is the first that supports AVX code
|
||||
generation. Two targets are provided: "avx", which runs with a
|
||||
programCount of 8, and "avx-x2" which runs 16 program instances
|
||||
simultaneously. (This binary is also built using the in-progress LLVM 3.0
|
||||
development libraries, while previous ones have been built with the
|
||||
released 2.9 version of LLVM.)
|
||||
|
||||
This release has no other significant changes beyond a number of small
|
||||
bugfixes (https://github.com/ispc/ispc/issues/100,
|
||||
https://github.com/ispc/ispc/issues/101, https://github.com/ispc/ispc/issues/103.)
|
||||
|
||||
=== v1.0.8 === (19 September 2011)
|
||||
|
||||
A number of improvements have been made to handling of 'if' statements in
|
||||
the language:
|
||||
- A bug was fixed where invalid memory could be incorrectly accessed even
|
||||
if none of the running program instances wanted to execute the
|
||||
corresponding instructions (https://github.com/ispc/ispc/issues/74).
|
||||
- The code generated for 'if' statements is a bit simpler and thus more
|
||||
efficient.
|
||||
|
||||
There is now a '--pic' command-line argument that causes position-independent
|
||||
code to be generated (Linux and OSX only).
|
||||
|
||||
A number of additional performance improvements:
|
||||
- Loops are now unrolled by default; the --opt=disable-loop-unroll
|
||||
command-line argument can be used to disable this behavior.
|
||||
(https://github.com/ispc/ispc/issues/78)
|
||||
- A few more cases where gathers/scatters could be determined at compile
|
||||
time to actually access contiguous locations have been added.
|
||||
(https://github.com/ispc/ispc/issues/79)
|
||||
|
||||
Finally, warnings are now issued (if possible) when it can be determined
|
||||
at compile-time that an out-of-bounds array index is being used.
|
||||
(https://github.com/ispc/ispc/issues/98).
|
||||
|
||||
|
||||
=== v1.0.7 === (3 September 2011)
|
||||
|
||||
The various atomic_*_global() standard library functions are generally
|
||||
substantially more efficient. They all previously issued one hardware
|
||||
atomic instruction for each running program instance but now locally
|
||||
compute a reduction over the operands and issue a single hardware atomic,
|
||||
giving the same effect and results in the end (issue #57).
|
||||
|
||||
CPU/ISA target handling has been substantially improved. If no CPU is
|
||||
specified, the host CPU type is used, not just a default of "nehalem". A
|
||||
number of bugs were fixed that ensure that LLVM doesn't generate SSE>2
|
||||
instructions when using the SSE2 target (fixes issue #82).
|
||||
|
||||
Shift rights of unsigned integer types use a logical shift right
|
||||
instruction now, not an arithmetic shift right (fixed issue #88).
|
||||
|
||||
When emitting header files, 'extern' declarations of globals used in ispc
|
||||
code are now outside of the ispc namespace. Fixes issue #64.
|
||||
|
||||
The stencil example has been modified to do runs with and without
|
||||
parallelism.
|
||||
|
||||
Many other small bugfixes and improvements.
|
||||
|
||||
=== v1.0.6 === (17 August 2011)
|
||||
|
||||
Some additional cross-program instance operations have been added to the
|
||||
standard library. reduce_equal() checks to see if the given value is the
|
||||
same across all running program instances, and exclusive_scan_{add,and,or}()
|
||||
computes a scan over the given value in the running program instances.
|
||||
See the documentation of these new routines for more information:
|
||||
http://ispc.github.com/ispc.html#cross-program-instance-operations.
|
||||
|
||||
The simple task system implementations used in the examples have been
|
||||
improved. The Windows version no longer has a hard limit on the number of
|
||||
tasks that can be launched, and all versions have less dynamic memory
|
||||
allocation and less locking. More of the examples now have paths that also
|
||||
measure performance using tasks along with SPMD vectorization.
|
||||
|
||||
Two new examples have been added: one that shows the implementation of a
|
||||
ray-marching volume rendering algorithm, and one that shows a 3D stencil
|
||||
computation, as might be done for PDE solutions.
|
||||
|
||||
Standard library routines to issue prefetches have been added. See the
|
||||
documentation for more details: http://ispc.github.com/ispc.html#prefetches.
|
||||
|
||||
Fast versions of the float to half-precision float conversion routines have
|
||||
been added. For more details, see:
|
||||
http://ispc.github.com/ispc.html#conversions-to-and-from-half-precision-floats.
|
||||
|
||||
There is the usual set of small bug fixes. Notably, a number of details
|
||||
related to handling 32 versus 64 bit targets have been fixed, which in turn
|
||||
has fixed a bug related to tasks having incorrect values for pointers
|
||||
passed to them.
|
||||
|
||||
=== v1.0.5 === (1 August 2011)
|
||||
|
||||
Multi-element vector swizzles are supported; for example, given a 3-wide
|
||||
vector "foo", then expressions like "foo.zyx" and "foo.yz" can be used to
|
||||
construct other short vectors. See
|
||||
http://ispc.github.com/ispc.html#short-vector-types
|
||||
for more details. (Thanks to Pete Couperus for implementing this code!).
|
||||
|
||||
int8 and int16 datatypes are now supported. It is still generally more
|
||||
efficient to use int32 for intermediate computations, even if the in-memory
|
||||
format is int8 or int16.
|
||||
|
||||
There are now standard library routines to convert to and from 'half'-format
|
||||
floating-point values (half_to_float() and float_to_half()).
|
||||
|
||||
There is a new example with an implementation of Perlin's Noise function
|
||||
(examples/noise). It shows a speedup of approximately 4.2x versus a C
|
||||
implementation on OSX and a 2.9x speedup versus C on Windows.
|
||||
|
||||
=== v1.0.4 === (18 July 2011)
|
||||
|
||||
enums are now supported in ispc; see the section on enumeration types in
|
||||
|
||||
681
docs/ispc.txt
681
docs/ispc.txt
@@ -33,6 +33,17 @@ The main goals behind ``ispc`` are to:
|
||||
number of non-trivial workloads that aren't handled well by other
|
||||
compilation approaches (e.g. loop auto-vectorization.)
|
||||
|
||||
**We are very interested in your feedback and comments about ispc and
|
||||
in hearing your experiences using the system. We are especially interested
|
||||
in hearing if you try using ispc but see results that are not as you
|
||||
were expecting or hoping for.** We encourage you to send a note with your
|
||||
experiences or comments to the `ispc-users`_ mailing list or to file bug or
|
||||
feature requests with the ``ispc`` `bug tracker`_. (Thanks!)
|
||||
|
||||
.. _ispc-users: http://groups.google.com/group/ispc-users
|
||||
.. _bug tracker: https://github.com/ispc/ispc/issues?state=open
|
||||
|
||||
|
||||
Contents:
|
||||
|
||||
* `Recent Changes to ISPC`_
|
||||
@@ -69,7 +80,8 @@ Contents:
|
||||
+ `Program Instance Convergence`_
|
||||
+ `Data Races`_
|
||||
+ `Uniform Variables and Varying Control Flow`_
|
||||
+ `Task Parallelism in ISPC`_
|
||||
+ `Task Parallelism: Language Syntax`_
|
||||
+ `Task Parallelism: Runtime Requirements`_
|
||||
|
||||
* `The ISPC Standard Library`_
|
||||
|
||||
@@ -77,7 +89,10 @@ Contents:
|
||||
+ `Output Functions`_
|
||||
+ `Cross-Program Instance Operations`_
|
||||
+ `Packed Load and Store Operations`_
|
||||
+ `Conversions To and From Half-Precision Floats`_
|
||||
+ `Atomic Operations and Memory Fences`_
|
||||
+ `Prefetches`_
|
||||
+ `System Information`_
|
||||
+ `Low-Level Bits`_
|
||||
|
||||
* `Interoperability with the Application`_
|
||||
@@ -92,12 +107,16 @@ Contents:
|
||||
+ `Understanding How to Interoperate With the Application's Data`_
|
||||
+ `Communicating Between SPMD Program Instances`_
|
||||
+ `Gather and Scatter`_
|
||||
+ `8 and 16-bit Integer Types`_
|
||||
+ `Low-level Vector Tricks`_
|
||||
+ `Debugging`_
|
||||
+ `The "Fast math" Option`_
|
||||
+ `"Inline" Aggressively`_
|
||||
+ `Small Performance Tricks`_
|
||||
+ `Instrumenting Your ISPC Programs`_
|
||||
+ `Using Scan Operations For Variable Output`_
|
||||
+ `Application-Supplied Execution Masks`_
|
||||
+ `Explicit Vector Programming With Uniform Short Vector Types`_
|
||||
|
||||
* `Disclaimer and Legal Information`_
|
||||
|
||||
@@ -427,7 +446,8 @@ The following identifiers are reserved as language keywords: ``bool``,
|
||||
``char``, ``cif``, ``cwhile``, ``const``, ``continue``, ``creturn``,
|
||||
``default``, ``do``, ``double``, ``else``, ``enum``, ``export``,
|
||||
``extern``, ``false``, ``float``, ``for``, ``goto``, ``if``, ``inline``, ``int``,
|
||||
``int32``, ``int64``, ``launch``, ``print``, ``reference``, ``return``,
|
||||
``int8``, ``int16``, ``int32``, ``int64``, ``launch``, ``print``,
|
||||
``reference``, ``return``,
|
||||
``signed``, ``sizeof``, ``soa``, ``static``, ``struct``, ``switch``,
|
||||
``sync``, ``task``, ``true``, ``typedef``, ``uniform``, ``union``,
|
||||
``unsigned``, ``varying``, ``void``, ``volatile``, ``while``.
|
||||
@@ -481,6 +501,10 @@ types.
|
||||
* ``void``: "empty" type representing no value.
|
||||
* ``bool``: boolean value; may be assigned ``true``, ``false``, or the
|
||||
value of a boolean expression.
|
||||
* ``int8``: 8-bit signed integer.
|
||||
* ``unsigned int8``: 8-bit unsigned integer.
|
||||
* ``int16``: 16-bit signed integer.
|
||||
* ``unsigned int16``: 16-bit unsigned integer.
|
||||
* ``int``: 32-bit signed integer; may also be specified as ``int32``.
|
||||
* ``unsigned int``: 32-bit unsigned integer; may also be specified as
|
||||
``unsigned int32``.
|
||||
@@ -497,7 +521,8 @@ general" of the two types, with the following precedence:
|
||||
|
||||
::
|
||||
|
||||
double > uint64 > int64 > float > uint32 > int32 > bool
|
||||
double > uint64 > int64 > float > uint32 > int32 >
|
||||
uint16 > int16 > uint8 > int8 > bool
|
||||
|
||||
In other words, adding an ``int64`` to a ``double`` causes the ``int64`` to
|
||||
be converted to a ``double``, the addition to be performed, and a
|
||||
@@ -665,6 +690,15 @@ expect, though the two vector types must have the same length:
|
||||
int<4> bat = foo; // ERROR: different vector lengths
|
||||
float<4> bing = foo; // ERROR: different vector lengths
|
||||
|
||||
For convenience, short vectors can be initialized with a list of individual
|
||||
element values:
|
||||
|
||||
::
|
||||
|
||||
float x = ..., y = ..., z = ...;
|
||||
float<3> pos = { x, y, z };
|
||||
|
||||
|
||||
There are two mechanisms to access the individual elements of these short
|
||||
vector data types. The first is with the array indexing operator:
|
||||
|
||||
@@ -693,25 +727,24 @@ using the array indexing operator with an index that is greater than the
|
||||
vector size, accessing an element that is beyond the vector's size is
|
||||
undefined behavior and may cause your program to crash.
|
||||
|
||||
Note: ``ispc`` doesn't support the "swizzling" operations that languages
|
||||
like HLSL do. Only a single element of the vector can be accessed at a
|
||||
time with these member operators.
|
||||
It is also possible to construct new short vectors from other short vector
|
||||
values using this syntax, extended for "swizzling". For example,
|
||||
|
||||
::
|
||||
|
||||
float<3> foo = ...;
|
||||
float<2> bar = foo.xy; // ERROR
|
||||
foo.xz = ...; // ERROR
|
||||
func(foo.xyx); // ERROR
|
||||
float<3> position = ...;
|
||||
float<3> new_pos = position.zyx; // reverse order of components
|
||||
float<2> pos_2d = position.xy;
|
||||
|
||||
For convenience, short vectors can be initialized with a list of individual
|
||||
element values:
|
||||
Though a single element can be assigned to, as in the examples above, it is
|
||||
not currently possible to use swizzles on the left-hand side of assignment
|
||||
expressions:
|
||||
|
||||
::
|
||||
|
||||
float x = ..., y = ..., z = ...;
|
||||
float<3> pos = { x, y, z };
|
||||
|
||||
int8<2> foo = ...;
|
||||
int8<2> bar = ...;
|
||||
foo.yz = bar; // Error: can't assign to left-hand side of expression
|
||||
|
||||
Struct and Array Types
|
||||
----------------------
|
||||
@@ -806,8 +839,8 @@ by default. If a function is declared with a ``static`` qualifier, then it
|
||||
is only visible in the file in which it was declared.
|
||||
|
||||
Any function that can be launched with the ``launch`` construct in ``ispc``
|
||||
must have a ``task`` qualifier; see `Task Parallelism in ISPC`_ for more
|
||||
discussion of launching tasks in ``ispc``.
|
||||
must have a ``task`` qualifier; see `Task Parallelism: Language Syntax`_
|
||||
for more discussion of launching tasks in ``ispc``.
|
||||
|
||||
Functions that are intended to be called from C/C++ application code must
|
||||
have the ``export`` qualifier. This causes them to have regular C linkage
|
||||
@@ -908,8 +941,9 @@ execution model is critical for writing efficient and correct programs in
|
||||
|
||||
``ispc`` supports both task parallelism to parallelize across multiple
|
||||
cores and SPMD parallelism to parallelize across the SIMD vector lanes on a
|
||||
single core. This section focuses on SPMD parallelism. See the section
|
||||
`Task Parallelism in ISPC`_ for discussion of task parallelism in ``ispc``.
|
||||
single core. This section focuses on SPMD parallelism. See the sections
|
||||
`Task Parallelism: Language Syntax`_ and `Task Parallelism: Runtime
|
||||
Requirements`_ for discussion of task parallelism in ``ispc``.
|
||||
|
||||
The SPMD-on-SIMD Execution Model
|
||||
--------------------------------
|
||||
@@ -1156,7 +1190,7 @@ This code implicitly assumes that ``programCount`` evenly divides
|
||||
::
|
||||
|
||||
for (uniform int i = 0; i < count; i += programCount) {
|
||||
if (i + programIndex < programCount) {
|
||||
if (i + programIndex < count) {
|
||||
float d = data[i + programIndex];
|
||||
...
|
||||
|
||||
@@ -1352,112 +1386,190 @@ be modified in the above code even if *none* of the program instances
|
||||
evaluated a true value for the test, given the ``ispc`` execution model.
|
||||
|
||||
|
||||
Task Parallelism in ISPC
|
||||
------------------------
|
||||
Task Parallelism: Language Syntax
|
||||
---------------------------------
|
||||
|
||||
One option for combining task-parallelism with ``ispc`` is to just use
|
||||
regular task parallelism in the C/C++ application code (be it through
|
||||
Intel® Cilk(tm), Intel® Thread Building Blocks or another task system,
|
||||
etc.), and for tasks to use ``ispc`` for SPMD parallelism across the vector
|
||||
lanes as appropriate. Alternatively, ``ispc`` also has some support for
|
||||
launching tasks from ``ispc`` code. The approach is similar to Intel®
|
||||
Cilk's task launch feature. (See the ``examples/mandelbrot_tasks`` example
|
||||
to see it used in a non-trivial example.)
|
||||
Intel® Cilk(tm), Intel® Threading Building Blocks or another task system), and
|
||||
for tasks to use ``ispc`` for SPMD parallelism across the vector lanes as
|
||||
appropriate. Alternatively, ``ispc`` also has support for launching tasks
|
||||
from ``ispc`` code. The approach is similar to Intel® Cilk's task launch
|
||||
feature. (See the ``examples/mandelbrot_tasks`` example to see it used in
|
||||
a small example.)
|
||||
|
||||
Any function that is launched as a task must be declared with the ``task``
|
||||
qualifier:
|
||||
First, any function that is launched as a task must be declared with the
|
||||
``task`` qualifier:
|
||||
|
||||
::
|
||||
|
||||
task void func(uniform float a[], uniform int start) {
|
||||
....
|
||||
task void func(uniform float a[], uniform int index) {
|
||||
...
|
||||
a[index] = ....
|
||||
}
|
||||
|
||||
Tasks must return ``void``; a compile time error is issued if a
|
||||
non-``void`` task is defined.
|
||||
|
||||
Given a task, one can then write code that launches tasks as follows:
|
||||
Given a task definition, there are two ways to write code that launches
|
||||
tasks, using the ``launch`` construct. First, one task can be launched at
|
||||
a time, with parameters passed to the task to help it determine what part
|
||||
of the overall computation it's responsible for:
|
||||
|
||||
::
|
||||
|
||||
for (uniform int i = 0; i < 100; ++i)
|
||||
launch < func(a, i); >
|
||||
launch < func(a, i) >;
|
||||
|
||||
Note the ``launch`` keyword and the brackets around the function call.
|
||||
This code launches 100 tasks, each of which presumably does some
|
||||
computation keyed off of the given value ``i``. In general, one should
|
||||
launch many more tasks than there are processors in the system to
|
||||
computation that is keyed off of the given value ``i``. In general, one
|
||||
should launch many more tasks than there are processors in the system to
|
||||
ensure good load-balancing, but not so many that the overhead of scheduling
|
||||
and running tasks dominates the computation.
|
||||
|
||||
Program execution continues asynchronously after task launch; thus, the
|
||||
function shouldn't access values being generated by the tasks without
|
||||
synchronization. A function uses a ``sync`` statement to wait for all
|
||||
launched tasks to finish:
|
||||
Alternatively, a number of tasks may be launched from a single ``launch``
|
||||
statement. We might instead write the above example with a single
|
||||
``launch`` like this:
|
||||
|
||||
::
|
||||
|
||||
for (uniform int i = 0; i < 100; ++i)
|
||||
launch < func(a, i); >
|
||||
launch[100] < func2(a) >;
|
||||
|
||||
Where an integer value (not necessarily a compile-time constant) is
|
||||
provided to the ``launch`` keyword in square brackets; this number of tasks
|
||||
will be enqueued to be run asynchronously. Within each of the tasks, two
|
||||
special built-in variables are available--``taskIndex``, and ``taskCount``.
|
||||
The first, ``taskIndex``, ranges from zero to one less than the number of tasks
|
||||
provided to ``launch``, and ``taskCount`` equals the number of launched
|
||||
tasks. Thus, we might use ``taskIndex`` in the implementation of ``func2``
|
||||
to determine which array element to process.
|
||||
|
||||
::
|
||||
|
||||
task void func2(uniform float a[]) {
|
||||
...
|
||||
a[taskIndex] = ...
|
||||
}
|
||||
|
||||
Program execution continues asynchronously after a ``launch`` statement;
|
||||
thus, a function shouldn't access values being generated by the tasks it
|
||||
has launched within the function without synchronization. If results are
|
||||
needed before function return, a function can use a ``sync`` statement to
|
||||
wait for all launched tasks to finish:
|
||||
|
||||
::
|
||||
|
||||
launch[100] < func2(a) >;
|
||||
sync;
|
||||
// now safe to use computed values in a[]...
|
||||
|
||||
Alternatively, any function that launches tasks has an implicit ``sync``
|
||||
before it returns, so that functions that call a function that launches
|
||||
tasks don't have to worry about outstanding asynchronous computation.
|
||||
Alternatively, any function that launches tasks has an automatically-added
|
||||
``sync`` statement before it returns, so that functions that call a
|
||||
function that launches tasks don't have to worry about outstanding
|
||||
asynchronous computation from that function.
|
||||
|
||||
Inside functions with the ``task`` qualifier, two additional built-in
|
||||
variables are provided: ``threadIndex`` and ``threadCount``.
|
||||
``threadCount`` gives the total number of hardware threads that have been
|
||||
launched by the task system. ``threadIndex`` provides an index between
|
||||
zero and ``threadCount-1`` that gives a unique index that corresponds to
|
||||
the hardware thread that is executing the current task. The
|
||||
``threadIndex`` can be used for accessing data that is private to the
|
||||
current thread and thus doesn't require synchronization to access under
|
||||
parallel execution.
|
||||
variables are provided in addition to ``taskIndex`` and ``taskCount``:
|
||||
``threadIndex`` and ``threadCount``. ``threadCount`` gives the total
|
||||
number of hardware threads that have been launched by the task system.
|
||||
``threadIndex`` provides an index between zero and ``threadCount-1`` that
|
||||
gives a unique index that corresponds to the hardware thread that is
|
||||
executing the current task. The ``threadIndex`` can be used for accessing
|
||||
data that is private to the current thread and thus doesn't require
|
||||
synchronization to access under parallel execution.
|
||||
|
||||
Task Parallelism: Runtime Requirements
|
||||
--------------------------------------
|
||||
|
||||
If you use the task launch feature in ``ispc``, you must provide C/C++
|
||||
implementations of two functions and link them into your final executable
|
||||
file. Although these functions may be implemented in either language, they
|
||||
must have "C" linkage (i.e. their prototypes must be declared inside an
|
||||
``extern "C"`` block if they are defined in C++.)
|
||||
implementations of three specific functions that manage launching and
|
||||
synchronizing parallel tasks; these functions must be linked into your
|
||||
executable. Although these functions may be implemented in any
|
||||
language, they must have "C" linkage (i.e. their prototypes must be
|
||||
declared inside an ``extern "C"`` block if they are defined in C++.)
|
||||
|
||||
By using user-supplied versions of these functions, ``ispc`` programs can
|
||||
easily interoperate with software systems that have existing task systems
|
||||
for managing parallelism. If you're using ``ispc`` with a system that
|
||||
isn't otherwise multi-threaded and don't want to write custom
|
||||
implementations of them, you can use the implementations of these functions
|
||||
provided in the ``examples/tasksys.cpp`` file in the ``ispc``
|
||||
distributions.
|
||||
|
||||
If you are implementing your own task system, the remainder of this section
|
||||
discusses the requirements for these calls. You will also likely want to
|
||||
review the example task systems in ``examples/tasksys.cpp`` for reference.
|
||||
If you are not implementing your own task system, you can skip reading the
|
||||
remainder of this section.
|
||||
|
||||
Here are the declarations of the three functions that must be provided to
|
||||
manage tasks in ``ispc``:
|
||||
|
||||
::
|
||||
|
||||
void ISPCLaunch(void *funcptr, void *data);
|
||||
void ISPCSync();
|
||||
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
|
||||
void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
|
||||
void ISPCSync(void *handle);
|
||||
|
||||
On Windows, two additional functions must be provided to dynamically
|
||||
allocate and free memory to store the arguments passed to tasks. (On OSX
|
||||
and Linux, the stack provides memory for task arguments; on Windows, the
|
||||
stack is generally not large enough to do this for large numbers of tasks.)
|
||||
All three of these functions take an opaque handle (or a pointer to an
|
||||
opaque handle) as their first parameter. This handle allows the task
|
||||
system runtime to distinguish between calls to these functions from
|
||||
different functions in ``ispc`` code. In this way, the task system
|
||||
implementation can efficiently wait for completion on just the tasks
|
||||
launched from a single function.
|
||||
|
||||
The first time one of ``ISPCLaunch()`` or ``ISPCAlloc()`` is called in an
|
||||
``ispc`` function, the ``void *`` pointed to by the ``handlePtr`` parameter
|
||||
will be ``NULL``. The implementations of these functions should then
|
||||
initialize ``*handlePtr`` to a unique handle value of some sort. (For
|
||||
example, it might allocate a small structure to record which tasks were
|
||||
launched by the current function.) In subsequent calls to these functions
|
||||
in the emitted ``ispc`` code, the same value for ``handlePtr`` will be
|
||||
passed in, such that loading from ``*handlePtr`` will retrieve the value
|
||||
stored in the first call.
|
||||
|
||||
At function exit (or at an explicit ``sync`` statement), a call to
|
||||
``ISPCSync()`` will be generated if ``*handlePtr`` is non-``NULL``.
|
||||
Therefore, the handle value is passed directly to ``ISPCSync()``, rather
|
||||
than a pointer to it, as in the other functions.
|
||||
|
||||
The ``ISPCAlloc()`` function is used to allocate small blocks of memory to
|
||||
store parameters passed to tasks. It should return a pointer to memory
|
||||
with the given size and alignment. Note that there is no explicit
|
||||
``ISPCFree()`` call; instead, all memory allocated within an ``ispc``
|
||||
function should be freed when ``ISPCSync()`` is called.
|
||||
|
||||
``ISPCLaunch()`` is called to launch one or more asynchronous
|
||||
tasks. Each ``launch`` statement in ``ispc`` code causes a call to
|
||||
``ISPCLaunch()`` to be emitted in the generated code. The three parameters
|
||||
after the handle pointer to this function are relatively straightforward;
|
||||
the ``void *f`` parameter holds a pointer to a function to call to run the
|
||||
work for this task, ``data`` holds a pointer to data to pass to this
|
||||
function, and ``count`` is the number of instances of this function to
|
||||
enqueue for asynchronous execution. (In other words, ``count`` corresponds
|
||||
to the value ``n`` in a multiple-task launch statement like ``launch[n]``.)
|
||||
|
||||
The signature of the provided function pointer ``f`` is
|
||||
|
||||
::
|
||||
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
||||
void ISPCFree(void *ptr);
|
||||
void (*TaskFuncPtr)(void *data, int threadIndex, int threadCount,
|
||||
int taskIndex, int taskCount)
|
||||
|
||||
These are called by the task launch code generated by the ``ispc``
|
||||
compiler; the first is called to launch a task and the second is
|
||||
called to wait for, respectively. (Factoring them out in this way
|
||||
allows ``ispc`` to inter-operate with the application's task system, if
|
||||
any, rather than having a separate one of its own.) To run a particular
|
||||
task, the task system should cast the function pointer to a ``void (*)(void
|
||||
*, int, int)`` function pointer and then call it with the provided ``void
|
||||
*`` data and then an index for the current hardware thread and the total
|
||||
number of hardware threads the task system has launched--in other words:
|
||||
|
||||
::
|
||||
|
||||
typedef void (*TaskFuncType)(void *, int, int);
|
||||
TaskFuncType tft = (TaskFuncType)(funcptr);
|
||||
tft(data, threadIndex, threadCount);
|
||||
|
||||
A number of sample task system implementations are provided with ``ispc``;
|
||||
see the files ``tasks_concrt.cpp``, ``tasks_gcd.cpp`` and
|
||||
``tasks_pthreads.cpp`` in the ``examples/mandelbrot_tasks`` directory of
|
||||
the ``ispc`` distribution.
|
||||
When this function pointer is called by one of the hardware threads managed
|
||||
by the task system, the ``data`` pointer passed to ``ISPCLaunch()`` should
|
||||
be passed to it for its first parameter; ``threadCount`` gives the total
|
||||
number of hardware threads that have been spawned to run tasks and
|
||||
``threadIndex`` should be an integer index between zero and ``threadCount-1``
|
||||
uniquely identifying the hardware thread that is running the task. (These
|
||||
values can be used to index into thread-local storage.)
|
||||
|
||||
The value of ``taskCount`` should be the number of tasks launched in the
|
||||
``launch`` statement that caused the call to ``ISPCLaunch()`` and each of
|
||||
the calls to this function should be given a unique value of ``taskIndex``
|
||||
between zero and ``taskCount-1``, to distinguish which of the instances
|
||||
of the set of launched tasks is running.
|
||||
|
||||
The ISPC Standard Library
|
||||
=========================
|
||||
@@ -1709,10 +1821,12 @@ the running program instances.
|
||||
|
||||
::
|
||||
|
||||
float broadcast(float value, uniform int index)
|
||||
int8 broadcast(int8 value, uniform int index)
|
||||
int16 broadcast(int16 value, uniform int index)
|
||||
int32 broadcast(int32 value, uniform int index)
|
||||
double broadcast(double value, uniform int index)
|
||||
int64 broadcast(int64 value, uniform int index)
|
||||
float broadcast(float value, uniform int index)
|
||||
double broadcast(double value, uniform int index)
|
||||
|
||||
The ``rotate()`` function allows each program instance to find the value of
|
||||
the given value that their neighbor ``offset`` steps away has. For
|
||||
@@ -1725,10 +1839,12 @@ provided offset value can be positive or negative, and may be greater than
|
||||
|
||||
::
|
||||
|
||||
float rotate(float value, uniform int offset)
|
||||
int8 rotate(int8 value, uniform int offset)
|
||||
int16 rotate(int16 value, uniform int offset)
|
||||
int32 rotate(int32 value, uniform int offset)
|
||||
double rotate(double value, uniform int offset)
|
||||
int64 rotate(int64 value, uniform int offset)
|
||||
float rotate(float value, uniform int offset)
|
||||
double rotate(double value, uniform int offset)
|
||||
|
||||
|
||||
Finally, the ``shuffle()`` functions allow two variants of fully general
|
||||
@@ -1739,10 +1855,12 @@ from which to get the value of ``value``. The provided values for
|
||||
|
||||
::
|
||||
|
||||
float shuffle(float value, int permutation)
|
||||
int8 shuffle(int8 value, int permutation)
|
||||
int16 shuffle(int16 value, int permutation)
|
||||
int32 shuffle(int32 value, int permutation)
|
||||
double shuffle(double value, int permutation)
|
||||
int64 shuffle(int64 value, int permutation)
|
||||
float shuffle(float value, int permutation)
|
||||
double shuffle(double value, int permutation)
|
||||
|
||||
|
||||
The second variant of ``shuffle()`` permutes over the extended vector that
|
||||
@@ -1753,10 +1871,12 @@ of ``value1``, etc.)
|
||||
|
||||
::
|
||||
|
||||
float shuffle(float value0, float value1, int permutation)
|
||||
int8 shuffle(int8 value0, int8 value1, int permutation)
|
||||
int16 shuffle(int16 value0, int16 value1, int permutation)
|
||||
int32 shuffle(int32 value0, int32 value1, int permutation)
|
||||
double shuffle(double value0, double value1, int permutation)
|
||||
int64 shuffle(int64 value0, int64 value1, int permutation)
|
||||
float shuffle(float value0, float value1, int permutation)
|
||||
double shuffle(double value0, double value1, int permutation)
|
||||
|
||||
The various variants of ``popcnt()`` return the population count--the
|
||||
number of bits set in the given value.
|
||||
@@ -1798,6 +1918,71 @@ given value across all of the currently-executing vector lanes.
|
||||
uniform int reduce_max(int a, int b)
|
||||
uniform unsigned int reduce_max(unsigned int a, unsigned int b)
|
||||
|
||||
Finally, you can check to see if a particular value has the same value in
|
||||
all of the currently-running program instances:
|
||||
|
||||
::
|
||||
|
||||
uniform bool reduce_equal(int32 v)
|
||||
uniform bool reduce_equal(unsigned int32 v)
|
||||
uniform bool reduce_equal(float v)
|
||||
uniform bool reduce_equal(int64 v)
|
||||
uniform bool reduce_equal(unsigned int64 v)
|
||||
uniform bool reduce_equal(double v)
|
||||
|
||||
There are also variants of these functions that return the value as a
|
||||
``uniform`` in the case where the values are all the same.
|
||||
|
||||
::
|
||||
|
||||
uniform bool reduce_equal(int32 v, reference uniform int32 sameval)
|
||||
uniform bool reduce_equal(unsigned int32 v,
|
||||
reference uniform unsigned int32 sameval)
|
||||
uniform bool reduce_equal(float v, reference uniform float sameval)
|
||||
uniform bool reduce_equal(int64 v, reference uniform int64 sameval)
|
||||
uniform bool reduce_equal(unsigned int64 v,
|
||||
reference uniform unsigned int64 sameval)
|
||||
uniform bool reduce_equal(double v, reference uniform double sameval)
|
||||
|
||||
If called when none of the program instances are running,
|
||||
``reduce_equal()`` will return ``false``.
|
||||
|
||||
There are also a number of functions to compute "scan"s of values across
|
||||
the program instances. For example, the ``exclusive_scan_add()`` function
|
||||
computes, for each program instance, the sum of the given value over all of
|
||||
the preceding program instances. (The scans currently available in
|
||||
``ispc`` are all so-called "exclusive" scans, meaning that the value
|
||||
computed for a given element does not include the value provided for that
|
||||
element.) In C code, an exclusive add scan over an array might be
|
||||
implemented as:
|
||||
|
||||
::
|
||||
|
||||
void scan_add(int *in_array, int *result_array, int count) {
|
||||
result_array[0] = 0;
|
||||
for (int i = 1; i < count; ++i)
|
||||
result_array[i] = result_array[i-1] + in_array[i-1];
|
||||
}
|
||||
|
||||
``ispc`` provides the following scan functions--addition, bitwise-and, and
|
||||
bitwise-or are available:
|
||||
|
||||
::
|
||||
|
||||
int32 exclusive_scan_add(int32 v)
|
||||
unsigned int32 exclusive_scan_add(unsigned int32 v)
|
||||
float exclusive_scan_add(float v)
|
||||
int64 exclusive_scan_add(int64 v)
|
||||
unsigned int64 exclusive_scan_add(unsigned int64 v)
|
||||
double exclusive_scan_add(double v)
|
||||
int32 exclusive_scan_and(int32 v)
|
||||
unsigned int32 exclusive_scan_and(unsigned int32 v)
|
||||
int64 exclusive_scan_and(int64 v)
|
||||
unsigned int64 exclusive_scan_and(unsigned int64 v)
|
||||
int32 exclusive_scan_or(int32 v)
|
||||
unsigned int32 exclusive_scan_or(unsigned int32 v)
|
||||
int64 exclusive_scan_or(int64 v)
|
||||
unsigned int64 exclusive_scan_or(unsigned int64 v)
|
||||
|
||||
|
||||
Packed Load and Store Operations
|
||||
@@ -1861,10 +2046,53 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v``
|
||||
|
||||
::
|
||||
|
||||
uniform int8 extract(int8 x, uniform int i)
|
||||
uniform int16 extract(int16 x, uniform int i)
|
||||
uniform int32 extract(int32 x, uniform int i)
|
||||
uniform int64 extract(int64 x, uniform int i)
|
||||
uniform float extract(float x, uniform int i)
|
||||
uniform int extract(int x, uniform int i)
|
||||
|
||||
::
|
||||
|
||||
int8 insert(int8 x, uniform int i, uniform int8 v)
|
||||
int16 insert(int16 x, uniform int i, uniform int16 v)
|
||||
int32 insert(int32 x, uniform int i, uniform int32 v)
|
||||
int64 insert(int64 x, uniform int i, uniform int64 v)
|
||||
float insert(float x, uniform int i, uniform float v)
|
||||
int insert(int x, uniform int i, uniform int v)
|
||||
|
||||
|
||||
Conversions To and From Half-Precision Floats
|
||||
---------------------------------------------
|
||||
|
||||
There are functions to convert to and from the IEEE 16-bit floating-point
|
||||
format. Note that there is no ``half`` data-type, and it isn't possible
|
||||
to do floating-point math directly with ``half`` types in ``ispc``; these
|
||||
functions facilitate converting to and from half-format data in memory.
|
||||
|
||||
To use them, half-format data should be loaded into an ``int16`` and the
|
||||
``half_to_float()`` function used to convert it to a 32-bit floating point
|
||||
value. To store a value to memory in half format, the ``float_to_half()``
|
||||
function returns the 16 bits that are the closest match to the given
|
||||
``float``, in half format.
|
||||
|
||||
::
|
||||
|
||||
float half_to_float(unsigned int16 h)
|
||||
uniform float half_to_float(uniform unsigned int16 h)
|
||||
int16 float_to_half(float f)
|
||||
uniform int16 float_to_half(uniform float f)
|
||||
|
||||
There are also faster versions of these functions that don't worry about
|
||||
handling floating point infinity, "not a number" and denormalized numbers
|
||||
correctly. These are faster than the above functions, but are less
|
||||
precise.
|
||||
|
||||
::
|
||||
|
||||
float half_to_float_fast(unsigned int16 h)
|
||||
uniform float half_to_float_fast(uniform unsigned int16 h)
|
||||
int16 float_to_half_fast(float f)
|
||||
uniform int16 float_to_half_fast(uniform float f)
|
||||
|
||||
|
||||
Atomic Operations and Memory Fences
|
||||
@@ -1886,12 +2114,12 @@ end.)
|
||||
|
||||
One thing to note is that the value being added to here is a
|
||||
``uniform`` integer, while the increment amount and the return value are
|
||||
``varying``. In other words, the semantics are that each running program
|
||||
instance individually issues the atomic operation with its own ``delta``
|
||||
value and gets the previous value of ``val`` back in return. The atomics
|
||||
for the running program instances may be issued in arbitrary order; it's
|
||||
not guaranteed that they will be issued in ``programIndex`` order, for
|
||||
example.
|
||||
``varying``. In other words, the semantics of this call are that each
|
||||
running program instance individually issues the atomic operation with its
|
||||
own ``delta`` value and gets the previous value of ``val`` back in return.
|
||||
The atomics for the running program instances may be issued in arbitrary
|
||||
order; it's not guaranteed that they will be issued in ``programIndex``
|
||||
order, for example.
|
||||
|
||||
Here are the declarations of the ``int32`` variants of these functions.
|
||||
There are also ``int64`` equivalents as well as variants that take
|
||||
@@ -1909,17 +2137,44 @@ function can be used with ``float`` and ``double`` types as well.)
|
||||
int32 atomic_xor_global(reference uniform int32 val, int32 value)
|
||||
int32 atomic_swap_global(reference uniform int32 val, int32 newval)
|
||||
|
||||
There is also an atomic "compare and exchange" function; it atomically
|
||||
compares the value in "val" to "compare"--if they match, it assigns
|
||||
"newval" to "val". In either case, the old value of "val" is returned.
|
||||
(As with the other atomic operations, there are also ``unsigned`` and
|
||||
64-bit variants of this function. Furthermore, there are ``float`` and
|
||||
``double`` variants as well.)
|
||||
There are also variants of these functions that take ``uniform`` values for
|
||||
the operand and return a ``uniform`` result:
|
||||
|
||||
::
|
||||
|
||||
uniform int32 atomic_add_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_subtract_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_min_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_max_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_and_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_or_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_xor_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_swap_global(reference uniform int32 val,
|
||||
uniform int32 newval)
|
||||
|
||||
There are also atomic swap and "compare and exchange" functions.
|
||||
Compare and exchange atomically compares the value in "val" to
|
||||
"compare"--if they match, it assigns "newval" to "val". In either case,
|
||||
the old value of "val" is returned. (As with the other atomic operations,
|
||||
there are also ``unsigned`` and 64-bit variants of this function.
|
||||
Furthermore, there are ``float`` and ``double`` variants as well.)
|
||||
|
||||
::
|
||||
|
||||
int32 atomic_swap_global(reference uniform int32 val, int32 new)
|
||||
uniform int32 atomic_swap_global(reference uniform int32 val,
|
||||
uniform int32 new)
|
||||
int32 atomic_compare_exchange_global(reference uniform int32 val,
|
||||
int32 compare, int32 newval)
|
||||
uniform int32 atomic_compare_exchange_global(reference uniform int32 val,
|
||||
uniform int32 compare, uniform int32 newval)
|
||||
|
||||
``ispc`` also has a standard library routine that inserts a memory barrier
|
||||
into the code; it ensures that all memory reads and writes prior to be
|
||||
@@ -1935,6 +2190,53 @@ code.
|
||||
void memory_barrier();
|
||||
|
||||
|
||||
Prefetches
|
||||
----------
|
||||
|
||||
The standard library has a variety of functions to prefetch data into the
|
||||
processor's cache. While modern CPUs have automatic prefetchers that do a
|
||||
reasonable job of prefetching data to the cache before it's needed, high
|
||||
performance applications may find it helpful to prefetch data before it's
|
||||
needed.
|
||||
|
||||
For example, this code shows how to prefetch data to the processor's L1
|
||||
cache while iterating over the items in an array.
|
||||
|
||||
::
|
||||
|
||||
uniform int32 array[...];
|
||||
for (uniform int i = 0; i < count; ++i) {
|
||||
// do computation with array[i]
|
||||
prefetch_l1(array[i+32]);
|
||||
}
|
||||
|
||||
The standard library has routines to prefetch to the L1, L2, and L3
|
||||
caches. It also has a variant, ``prefetch_nt()``, that indicates that the
|
||||
value being prefetched isn't expected to be used more than once (so should
|
||||
be high priority to be evicted from the cache).
|
||||
|
||||
::
|
||||
|
||||
void prefetch_{l1,l2,l3,nt}(reference TYPE)
|
||||
|
||||
These functions are available for all of the basic types in the
|
||||
language--``int8``, ``int16``, ``int32``, ``float``, and so forth.
|
||||
|
||||
|
||||
System Information
|
||||
------------------
|
||||
|
||||
A routine is available to find the number of CPU cores available in the
|
||||
system:
|
||||
|
||||
::
|
||||
|
||||
int num_cores()
|
||||
|
||||
This value can be useful for adapting the granularity of parallel task
|
||||
decomposition depending on the number of processors in the system.
|
||||
|
||||
|
||||
Low-Level Bits
|
||||
--------------
|
||||
|
||||
@@ -1948,41 +2250,6 @@ value ``true`` (rather than just having the value one). The
|
||||
int sign_extend(bool value)
|
||||
uniform int sign_extend(uniform bool value)
|
||||
|
||||
``ispc`` provides a number of bit/memory-level utility routines in its
|
||||
standard library as well. It has routines that load from and store
|
||||
to 8-bit and 16-bit integer values stored in memory, converting to and from
|
||||
32-bit integers for use in computation in ``ispc`` code. (These functions
|
||||
and this conversion step are necessary because ``ispc`` doesn't have native
|
||||
8-bit or 16-bit types in the language.)
|
||||
|
||||
::
|
||||
|
||||
int load_from_int8(uniform int a[], uniform int offset)
|
||||
unsigned int load_from_int8(uniform unsigned int a[],
|
||||
uniform int offset)
|
||||
void store_to_int8(uniform int a[], uniform int offset,
|
||||
int val)
|
||||
void store_to_int8(uniform unsigned int a[], uniform int offset,
|
||||
unsigned int val)
|
||||
unsigned int load_from_int16(uniform int a[],
|
||||
uniform int offset)
|
||||
unsigned unsigned int load_from_int16(uniform unsigned int a[],
|
||||
uniform int offset)
|
||||
void store_to_int16(uniform int a[], uniform int offset,
|
||||
int val)
|
||||
void store_to_int16(uniform unsigned int a[], uniform int offset,
|
||||
unsigned int val)
|
||||
|
||||
There are three things to note in these functions. First, note that these
|
||||
functions take either ``int`` or ``unsigned int`` arrays as parameters; you
|
||||
need to cast `the ``int8_t`` and ``int16_t`` pointers from the C/C++ side
|
||||
to ``int`` or ``unsigned int`` when passing them to ``ispc`` code. Second,
|
||||
although the arrays are passed as 32-bit integers, in the array indexing
|
||||
calculation, with the ``offset`` parameter, they are treated as if they
|
||||
were ``int8`` or ``int16`` types (i.e. the offset treated as being in terms
|
||||
of number of 8 or 16-bit elements). Third, note that the value of
|
||||
``programIndex`` is implicitly added to offset.
|
||||
|
||||
The ``intbits()`` and ``floatbits()`` functions can be used to implement
|
||||
low-level floating-point bit twiddling. For example, ``intbits()`` returns
|
||||
an ``unsigned int`` that is a bit-for-bit copy of the given ``float``
|
||||
@@ -2077,14 +2344,14 @@ Both the ``foo`` and ``bar`` global variables can be accessed on each
|
||||
side.
|
||||
|
||||
``ispc`` code can also call back to C/C++. On the ``ispc`` side, any
|
||||
application functions to be called must be declared with the ``export "C"``
|
||||
application functions to be called must be declared with the ``extern "C"``
|
||||
qualifier.
|
||||
|
||||
::
|
||||
|
||||
extern "C" void foo(uniform float f, uniform float g);
|
||||
|
||||
Unlike in C++, ``export "C"`` doesn't take braces to delineate
|
||||
Unlike in C++, ``extern "C"`` doesn't take braces to delineate
|
||||
multiple functions to be declared; thus, multiple C functions to be called
|
||||
from ``ispc`` must be declared as follows:
|
||||
|
||||
@@ -2517,6 +2784,15 @@ do a vector load. For example, given:
|
||||
|
||||
A regular vector load is done from array, starting at offset ``2*x``.
|
||||
|
||||
|
||||
8 and 16-bit Integer Types
|
||||
--------------------------
|
||||
|
||||
The code generated for 8 and 16-bit integer types is generally not as
|
||||
efficient as the code generated for 32-bit integer types. It is generally
|
||||
worthwhile to use 32-bit integer types for intermediate computations, even
|
||||
if the final result will be stored in a smaller integer type.
|
||||
|
||||
Low-level Vector Tricks
|
||||
-----------------------
|
||||
|
||||
@@ -2670,6 +2946,123 @@ active upon function entry.
|
||||
ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes
|
||||
...
|
||||
|
||||
|
||||
Using Scan Operations For Variable Output
|
||||
-----------------------------------------
|
||||
|
||||
One important application of the ``exclusive_scan_add()`` function in the
|
||||
standard library is when program instances want to generate a variable amount
|
||||
of output and when one would like that output to be densely packed in a
|
||||
single array. For example, consider the code fragment below:
|
||||
|
||||
::
|
||||
|
||||
uniform int func(uniform float outArray[], ...) {
|
||||
int numOut = ...; // figure out how many to be output
|
||||
float outLocal[MAX_OUT]; // staging area
|
||||
// put results in outLocal[0], ..., outLocal[numOut-1]
|
||||
int startOffset = exclusive_scan_add(numOut);
|
||||
for (int i = 0; i < numOut; ++i)
|
||||
outArray[startOffset + i] = outLocal[i];
|
||||
return reduce_add(numOut);
|
||||
}
|
||||
|
||||
Here, each program instance has computed a number, ``numOut``, of values to
|
||||
output, and has stored them in the ``outLocal`` array. Assume that four
|
||||
program instances are running and that the first one wants to output one
|
||||
value, the second two values, and the third and fourth three values each.
|
||||
In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6)
|
||||
to the four program instances, respectively. The first program instance
|
||||
will write its one result to ``outArray[0]``, the second will write its two
|
||||
values to ``outArray[1]`` and ``outArray[2]``, and so forth. The
|
||||
``reduce_add`` call at the end returns the total number of values that the
|
||||
program instances have written to the array.
|
||||
|
||||
Application-Supplied Execution Masks
|
||||
------------------------------------
|
||||
|
||||
Recall that when execution transitions from the application code to an
|
||||
``ispc`` function, all of the program instances are initially executing.
|
||||
In some cases, it may be desired that only some of them are running, based on
|
||||
a data-dependent condition computed in the application program. This
|
||||
situation can easily be handled via an additional parameter from the
|
||||
application.
|
||||
|
||||
As a simple example, consider a case where the application code has an
|
||||
array of ``float`` values and we'd like the ``ispc`` code to update
|
||||
just specific values in that array, where the choice of which values are to be
|
||||
updated has been determined by the application. In C++ code, we might
|
||||
have:
|
||||
|
||||
::
|
||||
|
||||
int count = ...;
|
||||
float *array = new float[count];
|
||||
bool *shouldUpdate = new bool[count];
|
||||
// initialize array and shouldUpdate
|
||||
ispc_func(array, shouldUpdate, count);
|
||||
|
||||
Then, the ``ispc`` code could process this update as:
|
||||
|
||||
::
|
||||
|
||||
export void ispc_func(uniform float array[], uniform bool update[],
|
||||
uniform int count) {
|
||||
for (uniform int i = 0; i < count; i += programCount) {
|
||||
cif (update[i+programIndex] == true)
|
||||
// update array[i+programIndex]...
|
||||
}
|
||||
}
|
||||
|
||||
(In this case a "coherent" if statement is likely to be worthwhile if the
|
||||
``update`` array will tend to have sections that are either all-true or
|
||||
all-false.)
|
||||
|
||||
Explicit Vector Programming With Uniform Short Vector Types
|
||||
-----------------------------------------------------------
|
||||
|
||||
The typical model for programming in ``ispc`` is an *implicit* parallel
|
||||
model, where one writes a program that is apparently doing scalar
|
||||
computation on values and the program is then vectorized to run in parallel
|
||||
across the SIMD lanes of a processor. However, ``ispc`` also has some
|
||||
support for explicit vector unit programming, where the vectorization is
|
||||
explicit. Some computations may be more effectively described in the
|
||||
explicit model rather than the implicit model.
|
||||
|
||||
This support is provided via ``uniform`` instances of short vectors
|
||||
(as were introduced in the `Short Vector Types`_ section). Specifically,
|
||||
if this short program
|
||||
|
||||
::
|
||||
|
||||
export uniform float<8> madd(uniform float<8> a,
|
||||
uniform float<8> b, uniform float<8> c) {
|
||||
return a + b * c;
|
||||
}
|
||||
|
||||
is compiled with the AVX target, ``ispc`` generates the following assembly:
|
||||
|
||||
::
|
||||
_madd:
|
||||
vmulps %ymm2, %ymm1, %ymm1
|
||||
vaddps %ymm0, %ymm1, %ymm0
|
||||
ret
|
||||
|
||||
(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
|
||||
``addps`` instructions are generated, and so forth.)
|
||||
|
||||
Note that ``ispc`` doesn't currently support control-flow based on
|
||||
``uniform`` short vector types; it is thus not possible to write code like:
|
||||
|
||||
::
|
||||
|
||||
export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
|
||||
uniform int<8> sum = 0;
|
||||
while (a++ < b)
|
||||
++sum;
|
||||
}
|
||||
|
||||
|
||||
Disclaimer and Legal Information
|
||||
================================
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
|
||||
# This could be handy for archiving the generated documentation or
|
||||
# if some version control system is used.
|
||||
|
||||
PROJECT_NUMBER = 1.0.4
|
||||
PROJECT_NUMBER = 1.0.10
|
||||
|
||||
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
|
||||
# base path where the generated documentation will be put.
|
||||
@@ -610,7 +610,7 @@ INPUT = builtins.h \
|
||||
util.cpp \
|
||||
parse.yy \
|
||||
lex.ll \
|
||||
stdlib-c.c
|
||||
builtins-c.c
|
||||
|
||||
# This tag can be used to specify the character encoding of the source files
|
||||
# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
|
||||
|
||||
@@ -13,6 +13,7 @@ against regular serial C++ implementations, printing out a comparison of
|
||||
the runtimes and the speedup delivered by ispc. It may be instructive to
|
||||
do a side-by-side diff of the C++ and ispc implementations of these
|
||||
algorithms to learn more about writing ispc code.
|
||||
|
||||
|
||||
AOBench
|
||||
=======
|
||||
@@ -27,6 +28,7 @@ It executes the program for the given number of iterations, rendering an
|
||||
(xres x yres) image each time and measuring the computation time with both
|
||||
serial and ispc implementations.
|
||||
|
||||
|
||||
AOBench_Instrumented
|
||||
====================
|
||||
|
||||
@@ -40,12 +42,47 @@ is provided in the instrument.cpp file.
|
||||
*** Note: on Linux, this example currently hits an assertion in LLVM during
|
||||
*** compilation
|
||||
|
||||
|
||||
Deferred
|
||||
========
|
||||
|
||||
This is an extensive example of using ispc for efficient
|
||||
deferred shading of scenes with thousands of lights; it's an implementation
|
||||
of the algorithm that Johan Andersson described at SIGGRAPH 2009,
|
||||
implemented by Andrew Lauritzen and Jefferson Montgomery. The basic idea
|
||||
is that a pre-rendered G-buffer is partitioned into tiles, and in each
|
||||
tile, the set of lights that contribute to the tile is first computed.
|
||||
Then, the pixels in the tile are shaded using just those light
|
||||
sources. (See slides 19-29 of
|
||||
http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
|
||||
for more details on the algorithm.)
|
||||
|
||||
This directory includes three implementations of the algorithm:
|
||||
|
||||
- An ispc implementation that first does a static partitioning of the
|
||||
screen into tiles to parallelize across the CPU cores. Within each tile
|
||||
ispc kernels provide highly efficient implementations of the light
|
||||
culling and shading calculations.
|
||||
- A "best practices" serial C++ implementation. This implementation does a
|
||||
dynamic partitioning of the screen, refining tiles with significant Z
|
||||
depth complexity (these tiles often have a large number of lights that
|
||||
affect them). Within each final tile, the pixels are shaded using
|
||||
regular C++ code.
|
||||
- If the Cilk extensions are available in your compiler, an ispc
|
||||
implementation that uses Cilk will also be built.
|
||||
(See http://software.intel.com/en-us/articles/intel-cilk-plus/). Like
|
||||
the "best practices" serial implementation, this version does dynamic
|
||||
tile partitioning for better load balancing and then uses ispc for the
|
||||
light culling and shading.
|
||||
|
||||
|
||||
Mandelbrot
|
||||
==========
|
||||
|
||||
Mandelbrot set generation. This example is extensively documented at the
|
||||
http://ispc.github.com/example.html page.
|
||||
|
||||
|
||||
Mandelbrot_tasks
|
||||
================
|
||||
|
||||
@@ -57,6 +94,14 @@ Linux, a pthreads-based task system is used (tasks_pthreads.cpp). When
|
||||
using tasks with ispc, no task system is mandated; the user is free to plug
|
||||
in any task system they want, for ease of interoperating with existing task
|
||||
systems.
|
||||
|
||||
|
||||
Noise
|
||||
=====
|
||||
|
||||
This example has an implementation of Ken Perlin's procedural "noise"
|
||||
function, as described in his 2002 "Improving Noise" SIGGRAPH paper.
|
||||
|
||||
|
||||
Options
|
||||
=======
|
||||
@@ -64,6 +109,7 @@ Options
|
||||
This program implements both the Black-Scholes and Binomial options pricing
|
||||
models in both ispc and regular serial C++ code.
|
||||
|
||||
|
||||
RT
|
||||
==
|
||||
|
||||
@@ -80,9 +126,25 @@ and triangle intersection code from pbrt; see the pbrt source code and/or
|
||||
"Physically Based Rendering" book for more about the basic algorithmic
|
||||
details.
|
||||
|
||||
|
||||
Simple
|
||||
======
|
||||
|
||||
This is a simple "hello world" type program that shows a ~10 line
|
||||
application program calling out to a ~5 line ispc program to do a simple
|
||||
computation.
|
||||
|
||||
|
||||
Volume
|
||||
======
|
||||
|
||||
Ray-marching volume rendering, with single scattering lighting model. To
|
||||
run it, specify a camera parameter file and a volume density file, e.g.:
|
||||
|
||||
volume camera.dat density_highres.vol
|
||||
|
||||
(See, e.g. Chapters 11 and 16 of "Physically Based Rendering" for
|
||||
information about the algorithm implemented here.) The volume data set
|
||||
included here was generated by the example implementation of the "Wavelet
|
||||
Turbulence for Fluid Simulation" SIGGRAPH 2008 paper by Kim et
|
||||
al. (http://www.cs.cornell.edu/~tedkim/WTURB/)
|
||||
|
||||
@@ -1,8 +1,14 @@
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --fast-math --arch=x86-64
|
||||
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
|
||||
|
||||
default: ao
|
||||
|
||||
@@ -14,12 +20,15 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ ao
|
||||
|
||||
ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o -lm -lpthread
|
||||
ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/ao.o: objs/ao_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
|
||||
@@ -101,6 +101,7 @@ savePPM(const char *fname, int w, int h)
|
||||
fprintf(fp, "255\n");
|
||||
fwrite(img, w * h * 3, 1, fp);
|
||||
fclose(fp);
|
||||
printf("Wrote image file %s\n", fname);
|
||||
}
|
||||
|
||||
|
||||
@@ -172,10 +173,30 @@ int main(int argc, char **argv)
|
||||
}
|
||||
|
||||
// Report results and save image
|
||||
printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", minTimeISPC,
|
||||
width, height);
|
||||
printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n",
|
||||
minTimeISPC, width, height);
|
||||
savePPM("ao-ispc.ppm", width, height);
|
||||
|
||||
//
|
||||
// Run the ispc + tasks path, test_iterations times, and report the
|
||||
// minimum time for any of them.
|
||||
//
|
||||
double minTimeISPCTasks = 1e30;
|
||||
for (unsigned int i = 0; i < test_iterations; i++) {
|
||||
memset((void *)fimg, 0, sizeof(float) * width * height * 3);
|
||||
assert(NSUBSAMPLES == 2);
|
||||
|
||||
reset_and_start_timer();
|
||||
ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
|
||||
double t = get_elapsed_mcycles();
|
||||
minTimeISPCTasks = std::min(minTimeISPCTasks, t);
|
||||
}
|
||||
|
||||
// Report results and save image
|
||||
printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n",
|
||||
minTimeISPCTasks, width, height);
|
||||
savePPM("ao-ispc-tasks.ppm", width, height);
|
||||
|
||||
//
|
||||
// Run the serial path, again test_iteration times, and report the
|
||||
// minimum time.
|
||||
@@ -192,7 +213,8 @@ int main(int argc, char **argv)
|
||||
// Report more results, save another image...
|
||||
printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial,
|
||||
width, height);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||
savePPM("ao-serial.ppm", width, height);
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -203,8 +203,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
|
||||
/* Compute the image for the scanlines from [y0,y1), for an overall image
|
||||
of width w and height h.
|
||||
*/
|
||||
void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
uniform int nsubsamples, reference uniform float image[]) {
|
||||
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
uniform int h, uniform int nsubsamples,
|
||||
reference uniform float image[]) {
|
||||
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
static Sphere spheres[3] = {
|
||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||
@@ -231,6 +232,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
// direction we do per iteration and ny the number in y.
|
||||
uniform int nx = 1, ny = 1;
|
||||
|
||||
// FIXME: We actually need ny to be 1 regardless of the decomposition,
|
||||
// since the task decomposition is one scanline high.
|
||||
|
||||
if (programCount == 8) {
|
||||
// Do two pixels at once in the x direction
|
||||
nx = 2;
|
||||
@@ -239,19 +243,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
++du;
|
||||
}
|
||||
else if (programCount == 16) {
|
||||
// Two at once in both x and y
|
||||
nx = ny = 2;
|
||||
if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
|
||||
nx = 4;
|
||||
ny = 1;
|
||||
if (programIndex >= 4 && programIndex < 8)
|
||||
++du;
|
||||
if (programIndex >= 8)
|
||||
++dv;
|
||||
if (programIndex >= 8 && programIndex < 12)
|
||||
du += 2;
|
||||
if (programIndex >= 12)
|
||||
du += 3;
|
||||
}
|
||||
|
||||
// Now loop over all of the pixels, stepping in x and y as calculated
|
||||
// above. (Assumes that ny divides y and nx divides x...)
|
||||
for (uniform int y = y0; y < y1; y += ny) {
|
||||
for (uniform int x = 0; x < w; x += nx) {
|
||||
// Figur out x,y pixel in NDC
|
||||
// Figure out x,y pixel in NDC
|
||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||
float ret = 0.f;
|
||||
@@ -293,7 +299,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
|
||||
// offset to the first pixel in the image
|
||||
uniform int offset = 3 * (y * w + x);
|
||||
for (uniform int p = 0; p < programCount; p += 4, ++offset) {
|
||||
for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
|
||||
// Get the four sample values for this pixel
|
||||
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
|
||||
retArray[p+3];
|
||||
@@ -315,3 +321,15 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
ao_scanlines(0, h, w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
|
||||
static void task ao_task(uniform int width, uniform int height,
|
||||
uniform int nsubsamples, uniform float image[]) {
|
||||
ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
|
||||
}
|
||||
|
||||
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
launch[h] < ao_task(w, h, nsubsamples, image) >;
|
||||
}
|
||||
|
||||
@@ -140,7 +140,7 @@ ray_plane_intersect(Isect &isect, Ray &ray,
|
||||
float d = -dot(plane.p, plane.n);
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
if (fabsf(v) < 1.0e-17)
|
||||
if (fabsf(v) < 1.0e-17f)
|
||||
return;
|
||||
else {
|
||||
float t = -(dot(ray.org, plane.n) + d) / v;
|
||||
@@ -183,11 +183,11 @@ orthoBasis(vec basis[3], const vec &n) {
|
||||
basis[2] = n;
|
||||
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
||||
|
||||
if ((n.x < 0.6) && (n.x > -0.6)) {
|
||||
if ((n.x < 0.6f) && (n.x > -0.6f)) {
|
||||
basis[1].x = 1.0;
|
||||
} else if ((n.y < 0.6) && (n.y > -0.6)) {
|
||||
} else if ((n.y < 0.6f) && (n.y > -0.6f)) {
|
||||
basis[1].y = 1.0;
|
||||
} else if ((n.z < 0.6) && (n.z > -0.6)) {
|
||||
} else if ((n.z < 0.6f) && (n.z > -0.6f)) {
|
||||
basis[1].z = 1.0;
|
||||
} else {
|
||||
basis[1].x = 1.0;
|
||||
@@ -224,7 +224,7 @@ ambient_occlusion(Isect &isect, Plane &plane,
|
||||
float phi = 2.0f * M_PI * drand48();
|
||||
float x = cosf(phi) * theta;
|
||||
float y = sinf(phi) * theta;
|
||||
float z = sqrtf(1.0 - theta * theta);
|
||||
float z = sqrtf(1.0f - theta * theta);
|
||||
|
||||
// local . global
|
||||
float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
|
||||
@@ -236,14 +236,14 @@ ambient_occlusion(Isect &isect, Plane &plane,
|
||||
ray.dir.y = ry;
|
||||
ray.dir.z = rz;
|
||||
|
||||
occIsect.t = 1.0e+17;
|
||||
occIsect.t = 1.0e+17f;
|
||||
occIsect.hit = 0;
|
||||
|
||||
for (int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(occIsect, ray, spheres[snum]);
|
||||
ray_plane_intersect (occIsect, ray, plane);
|
||||
|
||||
if (occIsect.hit) occlusion += 1.0;
|
||||
if (occIsect.hit) occlusion += 1.f;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -280,10 +280,10 @@ static void ao_scanlines(int y0, int y1, int w, int h, int nsubsamples,
|
||||
|
||||
ray.dir.x = px;
|
||||
ray.dir.y = py;
|
||||
ray.dir.z = -1.0;
|
||||
ray.dir.z = -1.0f;
|
||||
vnormalize(ray.dir);
|
||||
|
||||
isect.t = 1.0e+17;
|
||||
isect.t = 1.0e+17f;
|
||||
isect.hit = 0;
|
||||
|
||||
for (int snum = 0; snum < 3; ++snum)
|
||||
|
||||
3
examples/aobench/aobench.vcxproj
Executable file → Normal file
3
examples/aobench/aobench.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -21,6 +21,7 @@
|
||||
<ItemGroup>
|
||||
<ClCompile Include="ao.cpp" />
|
||||
<ClCompile Include="ao_serial.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="ao.ispc">
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -g3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --fast-math --instrument --arch=x86-64
|
||||
ISPCFLAGS=-O2 --instrument --arch=x86-64
|
||||
|
||||
default: ao
|
||||
|
||||
|
||||
@@ -100,6 +100,7 @@ savePPM(const char *fname, int w, int h)
|
||||
fprintf(fp, "255\n");
|
||||
fwrite(img, w * h * 3, 1, fp);
|
||||
fclose(fp);
|
||||
printf("Wrote image file %s\n", fname);
|
||||
}
|
||||
|
||||
|
||||
|
||||
0
examples/aobench_instrumented/aobench_instrumented.vcxproj
Executable file → Normal file
0
examples/aobench_instrumented/aobench_instrumented.vcxproj
Executable file → Normal file
@@ -48,19 +48,19 @@ static void __cpuid(int info[4], int infoType) {
|
||||
inline bool CPUSupportsSSE2() {
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
return (info[3] & (1 << 26));
|
||||
return (info[3] & (1 << 26)) != 0;
|
||||
}
|
||||
|
||||
inline bool CPUSupportsSSE4() {
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
return (info[2] & (1 << 19));
|
||||
return (info[2] & (1 << 19)) != 0;
|
||||
}
|
||||
|
||||
inline bool CPUSupportsAVX() {
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
return (info[2] & (1 << 28));
|
||||
return (info[2] & (1 << 28)) != 0;
|
||||
}
|
||||
|
||||
#endif // ISPC_CPUID_H
|
||||
|
||||
42
examples/deferred/Makefile
Normal file
42
examples/deferred/Makefile
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasks_pthreads.cpp
|
||||
TASK_LIB=-lpthread
|
||||
|
||||
ifeq ($(ARCH), Darwin)
|
||||
TASK_CXX=../tasks_gcd.cpp
|
||||
TASK_LIB=
|
||||
endif
|
||||
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64 --math-lib=fast
|
||||
|
||||
OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/dynamic_c.o objs/dynamic_cilk.o
|
||||
|
||||
default: deferred_shading
|
||||
|
||||
.PHONY: dirs clean
|
||||
.PRECIOUS: objs/kernels_ispc.h
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ deferred_shading
|
||||
|
||||
deferred_shading: dirs $(OBJS) $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
209
examples/deferred/common.cpp
Normal file
209
examples/deferred/common.cpp
Normal file
@@ -0,0 +1,209 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
#include "deferred.h"
|
||||
#include "../timing.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/** Allocate size bytes whose starting address is a multiple of alignment.

    alignment is assumed to be a power of two; the address rounding in the
    portable path below relies on that.  Returns NULL if the allocation
    fails (or, on the portable path, if the arguments are not usable).
    Memory returned from here must be released with lAlignedFree(), not
    with free() directly.
 */
static void *
lAlignedMalloc(int64_t size, int32_t alignment) {
#if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
#elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
#else
    // OS X, and any other platform without one of the allocators above:
    // over-allocate with malloc(), round the address up, and stash the
    // pointer malloc() returned just in front of the aligned block so
    // that lAlignedFree() can find it again.
    if (size < 0 || alignment <= 0)
        return NULL;

    // The rounding below advances by between 1 and alignment bytes (a
    // full alignment when the address past the header is already
    // aligned), so reserve alignment bytes of padding, not alignment-1.
    void *mem = malloc(size + alignment + sizeof(void *));
    if (mem == NULL)
        return NULL;

    char *amem = ((char *)mem) + sizeof(void *);
    amem += alignment - (reinterpret_cast<uintptr_t>(amem) & (alignment - 1));
    ((void **)amem)[-1] = mem;
    return amem;
#endif
}


/** Release memory obtained from lAlignedMalloc().  Passing NULL is a
    no-op on the portable path.
 */
static void
lAlignedFree(void *ptr) {
#if defined(ISPC_IS_WINDOWS)
    _aligned_free(ptr);
#elif defined(ISPC_IS_LINUX)
    free(ptr);
#else
    // The pointer originally returned by malloc() was stored immediately
    // before the aligned block by lAlignedMalloc().
    if (ptr != NULL)
        free(((void **)ptr)[-1]);
#endif
}
|
||||
|
||||
|
||||
/** Construct a framebuffer of width x height pixels, allocating one
    ALIGNMENT_BYTES-aligned array of nPixels bytes for each of the r, g
    and b channels.  The channel contents are not initialized here.
 */
Framebuffer::Framebuffer(int width, int height) {
    nPixels = width*height;

    // Allocate the three channels in r, g, b order.
    uint8_t **channels[3] = { &r, &g, &b };
    for (int c = 0; c < 3; ++c)
        *channels[c] = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
}
|
||||
|
||||
|
||||
/** Release the three channel arrays allocated by the constructor. */
Framebuffer::~Framebuffer() {
    uint8_t *channels[3] = { r, g, b };
    for (int c = 0; c < 3; ++c)
        lAlignedFree(channels[c]);
}
|
||||
|
||||
|
||||
void
|
||||
Framebuffer::clear() {
|
||||
memset(r, 0, nPixels);
|
||||
memset(g, 0, nPixels);
|
||||
memset(b, 0, nPixels);
|
||||
}
|
||||
|
||||
/** Load a deferred shading input data set from the binary file at path.

    The file is read as an ispc::InputHeader followed by a single data
    chunk of header.inputDataChunkSize bytes.  The chunk is read into an
    ALIGNMENT_BYTES-aligned buffer and the pointers in input->arrays are
    set to point into it at the offsets recorded in
    header.inputDataArrayOffsets.

    Returns a newly allocated InputData on success.  Returns NULL if the
    file can't be opened, is truncated, or the chunk can't be allocated;
    in those cases everything acquired so far (file handle, InputData,
    chunk) is released before returning.

    NOTE(review): the offsets read from the file are used as-is; they are
    not validated against inputDataChunkSize.
 */
InputData *
CreateInputDataFromFile(const char *path) {
    FILE *in = fopen(path, "rb");
    if (!in) return 0;

    InputData *input = new InputData;
    input->chunk = NULL;

    // Load header
    if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) {
        fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
        fclose(in);
        delete input;
        return NULL;
    }

    // Load data chunk and update pointers
    input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
                                             ALIGNMENT_BYTES);
    if (input->chunk == NULL) {
        fprintf(stderr, "Out of memory reading file \"%s\"\n", path);
        fclose(in);
        delete input;
        return NULL;
    }
    if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
        fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
        fclose(in);
        lAlignedFree(input->chunk);
        delete input;
        return NULL;
    }

    // Everything needed has been read; the file is no longer required.
    fclose(in);

    input->arrays.zBuffer =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
    input->arrays.normalEncoded_x =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]];
    input->arrays.normalEncoded_y =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]];
    input->arrays.specularAmount =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]];
    input->arrays.specularPower =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]];
    input->arrays.albedo_x =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]];
    input->arrays.albedo_y =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]];
    input->arrays.albedo_z =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]];
    input->arrays.lightPositionView_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]];
    input->arrays.lightPositionView_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]];
    input->arrays.lightPositionView_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]];
    input->arrays.lightAttenuationBegin =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]];
    input->arrays.lightColor_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]];
    input->arrays.lightColor_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]];
    input->arrays.lightColor_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]];
    input->arrays.lightAttenuationEnd =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]];

    return input;
}
|
||||
|
||||
|
||||
/** Release the data chunk owned by input.  Accepts NULL (which is what
    CreateInputDataFromFile() returns on failure) and is safe to call more
    than once on the same object, since the chunk pointer is cleared.

    NOTE(review): the InputData object itself, which
    CreateInputDataFromFile() allocates with new, is not deleted here;
    confirm whether callers are expected to delete it.
 */
void DeleteInputData(InputData *input)
{
    if (input == NULL)
        return;

    if (input->chunk != NULL) {
        lAlignedFree(input->chunk);
        // The pointers in input->arrays pointed into the chunk and are
        // dangling from here on; clear the owner so it isn't freed twice.
        input->chunk = NULL;
    }
}
|
||||
|
||||
|
||||
void WriteFrame(const char *filename, const InputData *input,
|
||||
const Framebuffer &framebuffer) {
|
||||
// Deswizzle and copy to RGBA output
|
||||
// Doesn't need to be fast... only happens once
|
||||
size_t imageBytes = 3 * input->header.framebufferWidth *
|
||||
input->header.framebufferHeight;
|
||||
uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
|
||||
memset(framebufferAOS, 0, imageBytes);
|
||||
|
||||
for (int i = 0; i < input->header.framebufferWidth *
|
||||
input->header.framebufferHeight; ++i) {
|
||||
framebufferAOS[3 * i + 0] = framebuffer.r[i];
|
||||
framebufferAOS[3 * i + 1] = framebuffer.g[i];
|
||||
framebufferAOS[3 * i + 2] = framebuffer.b[i];
|
||||
}
|
||||
|
||||
// Write out simple PPM file
|
||||
FILE *out = fopen(filename, "wb");
|
||||
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
fwrite(framebufferAOS, imageBytes, 1, out);
|
||||
|
||||
lAlignedFree(framebufferAOS);
|
||||
}
|
||||
BIN
examples/deferred/data/pp1280x720.bin
Normal file
BIN
examples/deferred/data/pp1280x720.bin
Normal file
Binary file not shown.
BIN
examples/deferred/data/pp1920x1200.bin
Normal file
BIN
examples/deferred/data/pp1920x1200.bin
Normal file
Binary file not shown.
108
examples/deferred/deferred.h
Normal file
108
examples/deferred/deferred.h
Normal file
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef DEFERRED_H
|
||||
#define DEFERRED_H
|
||||
|
||||
// Compile-time limits. These sit outside the "#ifndef ISPC" section below,
// so they are seen by both the C++ and the ISPC side of the example.

// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
#define MIN_TILE_WIDTH 16
#define MIN_TILE_HEIGHT 16
// NOTE(review): presumably the upper bound on the number of lights in a
// scene / per-tile light list -- confirm against the kernels that use it.
#define MAX_LIGHTS 1024
|
||||
|
||||
// Identifiers for the individual input data arrays. Values start at 0 and
// increase by one per entry, so idaNum (the last enumerator) equals the
// number of arrays.
// NOTE(review): presumably used to index a per-array offset/pointer table in
// the input header -- confirm against the loader.
enum InputDataArraysEnum {
    // G-buffer arrays
    idaZBuffer = 0,
    idaNormalEncoded_x,
    idaNormalEncoded_y,
    idaSpecularAmount,
    idaSpecularPower,
    idaAlbedo_x,
    idaAlbedo_y,
    idaAlbedo_z,
    // Light arrays
    idaLightPositionView_x,
    idaLightPositionView_y,
    idaLightPositionView_z,
    idaLightAttenuationBegin,
    idaLightColor_x,
    idaLightColor_y,
    idaLightColor_z,
    idaLightAttenuationEnd,

    // Number of arrays; must remain the last enumerator
    idaNum
};
|
||||
|
||||
#ifndef ISPC
|
||||
|
||||
#include <stdint.h>
|
||||
#include "kernels_ispc.h"
|
||||
|
||||
// Host-only configuration (this section is compiled only when ISPC is not
// defined).

// NOTE(review): presumably the byte alignment used for host-side buffer
// allocations -- confirm against the allocation code.
#define ALIGNMENT_BYTES 64

// MAX_LIGHTS is defined once, earlier in this header, for both the C++ and
// ISPC sides; it is intentionally not repeated here so the two values cannot
// drift apart.

// Compile-time default for the light-count visualisation switch (0 = off).
#define VISUALIZE_LIGHT_COUNT 0
|
||||
|
||||
// Bundles everything describing one input data set: the ISPC-declared header
// and array table (types come from kernels_ispc.h) plus a raw byte pointer.
struct InputData
{
    // Header record (type declared on the ISPC side)
    ispc::InputHeader header;
    // Table of the individual input arrays (type declared on the ISPC side)
    ispc::InputDataArrays arrays;
    // NOTE(review): presumably the single backing allocation that the entries
    // of `arrays` point into -- confirm in the loader implementation.
    uint8_t *chunk;
};
|
||||
|
||||
|
||||
// Output image stored as three separate 8-bit buffers (r, g, b).
// The constructor, destructor and clear() are implemented outside this header.
// NOTE(review): presumably the constructor allocates width*height bytes per
// buffer and the destructor releases them -- confirm in the implementation.
struct Framebuffer {
    Framebuffer(int width, int height);
    ~Framebuffer();

    void clear();

    uint8_t *r, *g, *b;

private:
    int nPixels;

    // Copying is disabled by declaring the copy operations private.
    // The assignment operator must take a const reference: a version taking
    // a pointer is not a copy-assignment operator, so the compiler would
    // still generate a public one and copies would share (and double free)
    // the r/g/b buffers.
    Framebuffer(const Framebuffer &);
    Framebuffer &operator=(const Framebuffer &);
};
|
||||
|
||||
|
||||
// Host-side entry points. Only the declarations live here; the definitions
// are in the example's .cpp files.

// Creates an InputData from the file at `path`.
// NOTE(review): presumably the result must be released with
// DeleteInputData() -- confirm in the implementation.
InputData *CreateInputDataFromFile(const char *path);
// Releases an InputData.
void DeleteInputData(InputData *input);
// Writes the contents of `framebuffer` to `filename`.
// NOTE(review): presumably `input` supplies the image dimensions -- confirm.
void WriteFrame(const char *filename, const InputData *input,
                const Framebuffer &framebuffer);
// One-time setup for the "dynamic" C++ and Cilk code paths.
void InitDynamicC(InputData *input);
void InitDynamicCilk(InputData *input);
// Run the "dynamic" C++ / Cilk code paths on `input`, writing to `framebuffer`.
void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
|
||||
|
||||
#endif // !ISPC
|
||||
|
||||
#endif // DEFERRED_H
|
||||
170
examples/deferred/deferred_shading.vcxproj
Executable file
170
examples/deferred/deferred_shading.vcxproj
Executable file
@@ -0,0 +1,170 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>mandelbrot</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="common.cpp" />
|
||||
<ClCompile Include="dynamic_c.cpp" />
|
||||
<ClCompile Include="dynamic_cilk.cpp" />
|
||||
<ClCompile Include="main.cpp" />
|
||||
<ClCompile Include="../tasks_concrt.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="kernels.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
871
examples/deferred/dynamic_c.cpp
Normal file
871
examples/deferred/dynamic_c.cpp
Normal file
@@ -0,0 +1,871 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "deferred.h"
#include "kernels_ispc.h"
#include <algorithm>
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
|
||||
|
||||
// Map the compiler/OS predefined macros onto exactly one ISPC_IS_* platform
// macro. The aligned allocation helpers below pick their implementation
// from it.
#ifdef _MSC_VER
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif

#ifdef ISPC_IS_LINUX
// Declares memalign(), used by lAlignedMalloc() on Linux
#include <malloc.h>
#endif // ISPC_IS_LINUX
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
// NOTE: these two macros repeat the definitions in deferred.h (included
// above). The values must stay identical, since a differing redefinition of
// a macro is ill-formed.
#define MIN_TILE_WIDTH 16
#define MIN_TILE_HEIGHT 16


// Number of levels in the min/max Z tile hierarchy created by InitDynamicC()
#define DYNAMIC_TREE_LEVELS 5
// If this is set to 1 then the result will be identical to the static version
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||
|
||||
// Allocates `size` bytes at an address that is a multiple of `alignment`.
// Memory returned here must be released with lAlignedFree().
//
// `alignment` is assumed to be a power of two (the Apple path masks with
// alignment - 1). The implementation is selected by the ISPC_IS_* platform
// macros; exactly one of them is expected to be defined.
static void *
lAlignedMalloc(int64_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
#endif
#ifdef ISPC_IS_LINUX
    return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
    // Over-allocate: one pointer slot to remember the block returned by
    // malloc(), plus up to alignment-1 bytes of padding.
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    if (mem == 0)
        return 0;

    // Skip the pointer slot, then round UP to the next aligned address.
    // An address that is already aligned is left where it is; advancing it
    // by a whole `alignment` would push the end of the usable region one
    // byte past the end of the allocation.
    char *amem = ((char*)mem) + sizeof(void*);
    uintptr_t misalignment = reinterpret_cast<uintptr_t>(amem) &
        (uintptr_t)(alignment - 1);
    if (misalignment != 0)
        amem += (uintptr_t)alignment - misalignment;

    // Stash the original pointer just below the aligned block so that
    // lAlignedFree() can recover it.
    ((void**)amem)[-1] = mem;
    return amem;
#endif
}
|
||||
|
||||
|
||||
// Releases a block obtained from lAlignedMalloc(), using the deallocation
// routine that matches the platform's allocation path.
static void
lAlignedFree(void *ptr) {
#if defined(ISPC_IS_WINDOWS)
    _aligned_free(ptr);
#elif defined(ISPC_IS_LINUX)
    free(ptr);
#elif defined(ISPC_IS_APPLE)
    // The pointer originally returned by malloc() is stored in the slot
    // immediately below the aligned block.
    void **slots = (void**)ptr;
    free(slots[-1]);
#endif
}
|
||||
|
||||
|
||||
// Computes the view-space depth range of the pixels in the rectangle
// [tileStartX, tileEndX) x [tileStartY, tileEndY).
//
// Each depth-buffer value z is unprojected as cameraProj_43 / (z - cameraProj_33).
// Only depths in [cameraNear, cameraFar) contribute. If no pixel qualifies,
// the outputs are *minZ == cameraFar and *maxZ == cameraNear.
static void
ComputeZBounds(int tileStartX, int tileEndX,
               int tileStartY, int tileEndY,
               // G-buffer data
               float zBuffer[],
               int gBufferWidth,
               // Camera data
               float cameraProj_33, float cameraProj_43,
               float cameraNear, float cameraFar,
               // Output
               float *minZ, float *maxZ)
{
    // Start from an "empty" range and grow it with every valid sample
    float nearest = cameraFar;
    float farthest = cameraNear;

    for (int row = tileStartY; row < tileEndY; ++row) {
        for (int col = tileStartX; col < tileEndX; ++col) {
            // Unproject the stored depth into view space
            const float depth = zBuffer[(row * gBufferWidth + col)];
            const float viewZ = cameraProj_43 / (depth - cameraProj_33);

            // Skip skybox/background or otherwise invalid pixels
            if (!(viewZ < cameraFar) || !(viewZ >= cameraNear))
                continue;

            if (viewZ < nearest)
                nearest = viewZ;
            if (farthest < viewZ)
                farthest = viewZ;
        }
    }

    *minZ = nearest;
    *maxZ = farthest;
}
|
||||
|
||||
|
||||
static void
|
||||
ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
|
||||
int numTilesX, int numTilesY,
|
||||
// G-buffer data
|
||||
float zBuffer[],
|
||||
int gBufferWidth,
|
||||
// Camera data
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar,
|
||||
// Output
|
||||
float minZArray[],
|
||||
float maxZArray[])
|
||||
{
|
||||
for (int tileX = 0; tileX < numTilesX; ++tileX) {
|
||||
float minZ, maxZ;
|
||||
ComputeZBounds(
|
||||
tileX * tileWidth, tileX * tileWidth + tileWidth,
|
||||
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
||||
zBuffer, gBufferWidth,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
&minZ, &maxZ);
|
||||
minZArray[tileX] = minZ;
|
||||
maxZArray[tileX] = maxZ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class MinMaxZTree
|
||||
{
|
||||
public:
|
||||
// Currently (min) tile dimensions must divide gBuffer dimensions evenly
|
||||
// Levels must be small enough that neither dimension goes below one tile
|
||||
MinMaxZTree(
|
||||
int tileWidth, int tileHeight, int levels,
|
||||
int gBufferWidth, int gBufferHeight)
|
||||
: mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
|
||||
{
|
||||
mNumTilesX = gBufferWidth / mTileWidth;
|
||||
mNumTilesY = gBufferHeight / mTileHeight;
|
||||
|
||||
// Allocate arrays
|
||||
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
int x = NumTilesX(i);
|
||||
int y = NumTilesY(i);
|
||||
assert(x > 0);
|
||||
assert(y > 0);
|
||||
// NOTE: If the following two asserts fire it probably means that
|
||||
// the base tile dimensions do not evenly divide the G-buffer dimensions
|
||||
assert(x * (mTileWidth << i) >= gBufferWidth);
|
||||
assert(y * (mTileHeight << i) >= gBufferHeight);
|
||||
mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
}
|
||||
}
|
||||
|
||||
void Update(float *zBuffer, int gBufferPitchInElements,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar)
|
||||
{
|
||||
for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
|
||||
ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
|
||||
zBuffer, gBufferPitchInElements,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
mMinZArrays[0] + (tileY * mNumTilesX),
|
||||
mMaxZArrays[0] + (tileY * mNumTilesX));
|
||||
}
|
||||
|
||||
// Generate other levels
|
||||
for (int level = 1; level < mLevels; ++level) {
|
||||
int destTilesX = NumTilesX(level);
|
||||
int destTilesY = NumTilesY(level);
|
||||
int srcLevel = level - 1;
|
||||
int srcTilesX = NumTilesX(srcLevel);
|
||||
int srcTilesY = NumTilesY(srcLevel);
|
||||
for (int y = 0; y < destTilesY; ++y) {
|
||||
for (int x = 0; x < destTilesX; ++x) {
|
||||
int srcX = x << 1;
|
||||
int srcY = y << 1;
|
||||
// NOTE: Ugly branches to deal with non-multiple dimensions at some levels
|
||||
// TODO: SSE branchless min/max is probably better...
|
||||
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
if (srcX + 1 < srcTilesX) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
}
|
||||
}
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
}
|
||||
mMinZArrays[level][y * destTilesX + x] = minZ;
|
||||
mMaxZArrays[level][y * destTilesX + x] = maxZ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~MinMaxZTree() {
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
lAlignedFree(mMinZArrays[i]);
|
||||
lAlignedFree(mMaxZArrays[i]);
|
||||
}
|
||||
lAlignedFree(mMinZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
}
|
||||
|
||||
int Levels() const { return mLevels; }
|
||||
|
||||
// These round UP, so beware that the last tile for a given level may not be completely full
|
||||
// TODO: Verify this...
|
||||
int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
|
||||
int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
|
||||
int TileWidth(int level = 0) const { return (mTileWidth << level); }
|
||||
int TileHeight(int level = 0) const { return (mTileHeight << level); }
|
||||
|
||||
float MinZ(int level, int tileX, int tileY) const {
|
||||
return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
float MaxZ(int level, int tileX, int tileY) const {
|
||||
return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
|
||||
private:
|
||||
int mTileWidth;
|
||||
int mTileHeight;
|
||||
int mLevels;
|
||||
int mNumTilesX;
|
||||
int mNumTilesY;
|
||||
|
||||
// One array for each "level" in the tree
|
||||
float **mMinZArrays;
|
||||
float **mMaxZArrays;
|
||||
};
|
||||
|
||||
static MinMaxZTree *gMinMaxZTree = 0;
|
||||
|
||||
void InitDynamicC(InputData *input) {
|
||||
gMinMaxZTree =
|
||||
new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
|
||||
input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
}
|
||||
|
||||
|
||||
// Distributes a tile's light list over its four subtiles.
//
// The tile is split at pixel (tileMidX, tileMidY). Subtiles are numbered
// 0..3 in the order 00, 10, 01, 11: bit 0 selects the X half, bit 1 the Y half.
// A light is kept for a subtile when its attenuation range reaches the
// subtile's [subtileMinZ, subtileMaxZ] depth range and it is not entirely on
// the far side of the vertical / horizontal splitting plane.
//
// Surviving light indices for subtile s are written to
// subtileIndices[s * subtileIndicesPitch ...] and their count to
// subtileNumLights[s].
//
// numLights need not be a multiple of programCount here, but the input and
// output arrays should be able to handle programCount-sized load/stores.
static void
SplitTileMinMax(
    int tileMidX, int tileMidY,
    // Subtile data (00, 10, 01, 11)
    float subtileMinZ[],
    float subtileMaxZ[],
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int lightIndices[],
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Outputs
    int subtileIndices[],
    int subtileIndicesPitch,
    int subtileNumLights[]
    )
{
    const float halfWidth = 0.5f * (float)gBufferWidth;
    const float halfHeight = 0.5f * (float)gBufferHeight;

    // Splitting planes through the eye: index 0 separates left/right,
    // index 1 separates top/bottom. Each has one x-or-y and one z component.
    float planeXY[2] = { -(cameraProj_11 * halfWidth),
                         (cameraProj_22 * halfHeight) };
    float planeZ[2] = { tileMidX - halfWidth,
                        tileMidY - halfHeight };

    for (int axis = 0; axis < 2; ++axis) {
        // Scale to unit length so plane distances compare against radii
        const float invLength = 1.f / sqrtf(planeXY[axis] * planeXY[axis] +
                                            planeZ[axis] * planeZ[axis]);
        planeXY[axis] *= invLength;
        planeZ[axis] *= invLength;
    }

    // Next write position inside each subtile's slice of subtileIndices
    int writePos[4];
    for (int s = 0; s < 4; ++s)
        writePos[s] = s * subtileIndicesPitch;

    for (int n = 0; n < numLights; ++n) {
        const int lightIndex = lightIndices[n];

        const float lightX = light_positionView_x_array[lightIndex];
        const float lightY = light_positionView_y_array[lightIndex];
        const float lightZ = light_positionView_z_array[lightIndex];
        const float radius = light_attenuationEnd_array[lightIndex];
        const float negRadius = -radius;

        // Signed distances of the light centre to the two splitting planes
        const float dx = lightZ * planeZ[0] +
            lightX * planeXY[0];
        const float dy = lightZ * planeZ[1] +
            lightY * planeXY[1];

        // A plane only rejects a light whose range does not cross it
        const bool cullByX = fabsf(dx) > radius;
        const bool cullByY = fabsf(dy) > radius;
        const bool positiveX = dx > 0.0f;
        const bool positiveY = dy > 0.0f;

        for (int s = 0; s < 4; ++s) {
            // Depth test against this subtile's z bounds
            bool keep = (lightZ - subtileMinZ[s] >= negRadius) &&
                        (subtileMaxZ[s] - lightZ >= negRadius);

            // Subtiles 0 and 2 lie on the positive-dx side, 1 and 3 on the other
            const bool onPositiveXSide = (s & 1) == 0;
            if (cullByX && positiveX != onPositiveXSide)
                keep = false;

            // Subtiles 0 and 1 lie on the positive-dy side, 2 and 3 on the other
            const bool onPositiveYSide = (s & 2) == 0;
            if (cullByY && positiveY != onPositiveYSide)
                keep = false;

            if (keep)
                subtileIndices[writePos[s]++] = lightIndex;
        }
    }

    // Convert end positions back into per-subtile counts
    for (int s = 0; s < 4; ++s)
        subtileNumLights[s] = writePos[s] - s * subtileIndicesPitch;
}
|
||||
|
||||
|
||||
// Dot product of the 3-vectors (x, y, z) and (a, b, c).
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    const float product = x*a + y*b + z*c;
    return product;
}
|
||||
|
||||
|
||||
// Writes the unit-length version of (x, y, z) to (ox, oy, oz).
// The inputs are taken by value, so the outputs may refer to the same
// variables that were passed as inputs. A zero-length input is not handled.
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    const float invLength = 1.f / sqrtf(x*x + y*y + z*z);
    ox = x * invLength;
    oy = y * invLength;
    oz = z * invLength;
}
|
||||
|
||||
|
||||
// Converts an 8-bit unsigned normalised value (0..255) to a float by
// multiplying with 1/255.
static inline float
Unorm8ToFloat32(uint8_t u) {
    const float asFloat = (float)u;
    return asFloat * (1.0f / 255.0f);
}
|
||||
|
||||
|
||||
// Converts a float to an 8-bit unsigned normalised value by scaling with
// 255 and truncating. No clamping is performed, so callers are expected to
// pass values whose scaled result fits in 0..255.
static inline uint8_t
Float32ToUnorm8(float f) {
    const float scaled = f * 255.0f;
    return (uint8_t)scaled;
}
|
||||
|
||||
|
||||
// Converts a 16-bit half-precision bit pattern to a float.
//
// "Fast" means no special cases: the sign, exponent and mantissa fields are
// simply re-biased and shifted into single-precision position. Normal values
// convert exactly; a zero exponent field (zero/denormal) or an all-ones
// exponent field (inf/NaN) is treated like any other exponent, e.g. h == 0
// yields 2^-15 rather than 0.
static inline float half_to_float_fast(uint16_t h) {
    uint32_t hs = h & (int32_t)0x8000u;  // Pick off sign bit
    uint32_t he = h & (int32_t)0x7C00u;  // Pick off exponent bits
    uint32_t hm = h & (int32_t)0x03FFu;  // Pick off mantissa bits

    // sign
    uint32_t xs = ((uint32_t) hs) << 16;
    // Exponent: unbias the halfp, then bias the single
    int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
    // Exponent
    uint32_t xe = (uint32_t) (xes << 23);
    // Mantissa
    uint32_t xm = ((uint32_t) hm) << 13;

    uint32_t bits = (xs | xe | xm);

    // Reinterpret the bit pattern via memcpy: reading a uint32_t through a
    // float pointer violates the strict-aliasing rule and may be miscompiled.
    float result;
    memcpy(&result, &bits, sizeof(result));
    return result;
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeTileC(
|
||||
int32_t tileStartX, int32_t tileEndX,
|
||||
int32_t tileStartY, int32_t tileEndY,
|
||||
int32_t gBufferWidth, int32_t gBufferHeight,
|
||||
const ispc::InputDataArrays &inputData,
|
||||
// Camera data
|
||||
float cameraProj_11, float cameraProj_22,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
// Light list
|
||||
int32_t tileLightIndices[],
|
||||
int32_t tileNumLights,
|
||||
// UI
|
||||
bool visualizeLightCount,
|
||||
// Output
|
||||
uint8_t framebuffer_r[],
|
||||
uint8_t framebuffer_g[],
|
||||
uint8_t framebuffer_b[]
|
||||
)
|
||||
{
|
||||
if (tileNumLights == 0 || visualizeLightCount) {
|
||||
uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));
|
||||
for (int32_t y = tileStartY; y < tileEndY; ++y) {
|
||||
for (int32_t x = tileStartX; x < tileEndX; ++x) {
|
||||
int32_t framebufferIndex = (y * gBufferWidth + x);
|
||||
framebuffer_r[framebufferIndex] = c;
|
||||
framebuffer_g[framebufferIndex] = c;
|
||||
framebuffer_b[framebufferIndex] = c;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
float twoOverGBufferWidth = 2.0f / gBufferWidth;
|
||||
float twoOverGBufferHeight = 2.0f / gBufferHeight;
|
||||
|
||||
for (int32_t y = tileStartY; y < tileEndY; ++y) {
|
||||
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||
|
||||
for (int32_t x = tileStartX; x < tileEndX; ++x) {
|
||||
int32_t gBufferOffset = y * gBufferWidth + x;
|
||||
|
||||
// Reconstruct position and (negative) view vector from G-buffer
|
||||
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||
float Vneg_x, Vneg_y, Vneg_z;
|
||||
|
||||
float z = inputData.zBuffer[gBufferOffset];
|
||||
|
||||
// Compute screen/clip-space position
|
||||
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||
float positionScreen_x = (0.5f + (float)(x)) *
|
||||
twoOverGBufferWidth - 1.0f;
|
||||
|
||||
// Unproject depth buffer Z value into view space
|
||||
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
cameraProj_11;
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
cameraProj_22;
|
||||
|
||||
// We actually end up with a vector pointing *at* the
|
||||
// surface (i.e. the negative view vector)
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
|
||||
|
||||
// Reconstruct normal from G-buffer
|
||||
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
|
||||
|
||||
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||
float m = sqrtf(4.0f * f - 1.0f);
|
||||
|
||||
surface_normal_x = m * (4.0f * normal_x - 2.0f);
|
||||
surface_normal_y = m * (4.0f * normal_y - 2.0f);
|
||||
surface_normal_z = 3.0f - 8.0f * f;
|
||||
|
||||
// Load other G-buffer parameters
|
||||
float surface_specularAmount =
|
||||
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
|
||||
float surface_specularPower =
|
||||
half_to_float_fast(inputData.specularPower[gBufferOffset]);
|
||||
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||
|
||||
float lit_x = 0.0f;
|
||||
float lit_y = 0.0f;
|
||||
float lit_z = 0.0f;
|
||||
for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
++tileLightIndex) {
|
||||
int32_t lightIndex = tileLightIndices[tileLightIndex];
|
||||
|
||||
// Gather light data relevant to initial culling
|
||||
float light_positionView_x =
|
||||
inputData.lightPositionView_x[lightIndex];
|
||||
float light_positionView_y =
|
||||
inputData.lightPositionView_y[lightIndex];
|
||||
float light_positionView_z =
|
||||
inputData.lightPositionView_z[lightIndex];
|
||||
float light_attenuationEnd =
|
||||
inputData.lightAttenuationEnd[lightIndex];
|
||||
|
||||
// Compute light vector
|
||||
float L_x = light_positionView_x - surface_positionView_x;
|
||||
float L_y = light_positionView_y - surface_positionView_y;
|
||||
float L_z = light_positionView_z - surface_positionView_z;
|
||||
|
||||
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip at end of attenuation
|
||||
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
|
||||
|
||||
if (distanceToLight2 < light_attenutaionEnd2) {
|
||||
float distanceToLight = sqrtf(distanceToLight2);
|
||||
|
||||
float distanceToLightRcp = 1.f / distanceToLight;
|
||||
L_x *= distanceToLightRcp;
|
||||
L_y *= distanceToLightRcp;
|
||||
L_z *= distanceToLightRcp;
|
||||
|
||||
// Start computing brdf
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip back facing
|
||||
if (NdotL > 0.0f) {
|
||||
float light_attenuationBegin =
|
||||
inputData.lightAttenuationBegin[lightIndex];
|
||||
|
||||
// Light distance attenuation (linstep)
|
||||
float lightRange = (light_attenuationEnd - light_attenuationBegin);
|
||||
float falloffPosition = (light_attenuationEnd - distanceToLight);
|
||||
float attenuation = std::min(falloffPosition / lightRange, 1.0f);
|
||||
|
||||
float H_x = (L_x - Vneg_x);
|
||||
float H_y = (L_y - Vneg_y);
|
||||
float H_z = (L_z - Vneg_z);
|
||||
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, H_x, H_y, H_z);
|
||||
NdotH = std::max(NdotH, 0.0f);
|
||||
|
||||
float specular = powf(NdotH, surface_specularPower);
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
(1.0f / 8.0f);
|
||||
float specularContrib = surface_specularAmount *
|
||||
specularNorm * specular;
|
||||
|
||||
float k = attenuation * NdotL * (1.0f + specularContrib);
|
||||
|
||||
float light_color_x = inputData.lightColor_x[lightIndex];
|
||||
float light_color_y = inputData.lightColor_y[lightIndex];
|
||||
float light_color_z = inputData.lightColor_z[lightIndex];
|
||||
|
||||
float lightContrib_x = surface_albedo_x * light_color_x;
|
||||
float lightContrib_y = surface_albedo_y * light_color_y;
|
||||
float lightContrib_z = surface_albedo_z * light_color_z;
|
||||
|
||||
lit_x += lightContrib_x * k;
|
||||
lit_y += lightContrib_y * k;
|
||||
lit_z += lightContrib_z * k;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Gamma correct
|
||||
float gamma = 1.0 / 2.2f;
|
||||
lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
|
||||
lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
|
||||
lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
|
||||
|
||||
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
|
||||
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
|
||||
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// If we few enough lights or this is the base case (last level), shade
|
||||
// this full tile directly
|
||||
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// Skip entirely offscreen tiles
|
||||
if (endX > startX && endY > startY) {
|
||||
ShadeTileC(startX, endX, startY, endY,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->arrays,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||
// Move down a level in the tree
|
||||
--level;
|
||||
tileX <<= 1;
|
||||
tileY <<= 1;
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
|
||||
// Work out splitting coords
|
||||
int midX = (tileX + 1) * width;
|
||||
int midY = (tileY + 1) * height;
|
||||
|
||||
// Read subtile min/max data
|
||||
// NOTE: We must be sure to handle out-of-bounds access here since
|
||||
// sometimes we'll only have 1 or 2 subtiles for non-pow-2
|
||||
// framebuffer sizes.
|
||||
bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
|
||||
bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
|
||||
|
||||
// NOTE: Order is 00, 10, 01, 11
|
||||
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
input->header.cameraFar, input->header.cameraFar};
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
input->header.cameraNear, input->header.cameraNear};
|
||||
|
||||
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
if (rightTileExists) {
|
||||
minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
|
||||
maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
|
||||
if (bottomTileExists) {
|
||||
minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
|
||||
maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
|
||||
}
|
||||
}
|
||||
if (bottomTileExists) {
|
||||
minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
|
||||
maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
|
||||
}
|
||||
|
||||
// Cull lights into subtile lists
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int subtileLightIndices[4][MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int subtileNumLights[4];
|
||||
SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd,
|
||||
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||
|
||||
// Recurse into subtiles
|
||||
ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
subtileLightIndices[0], subtileNumLights[0],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||
subtileLightIndices[1], subtileNumLights[1],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX , tileY + 1,
|
||||
subtileLightIndices[2], subtileNumLights[2],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
|
||||
subtileLightIndices[3], subtileNumLights[3],
|
||||
framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Collects the lights whose bounding sphere (view-space position, radius =
// attenuation end) intersects a screen tile's frustum.
//
// A light is kept when it passes six tests: the tile's [minZ, maxZ] depth
// range and the four side planes through the tile's left, right, bottom and
// top edges.  Every test is "signed distance >= -radius".
//
//   tileStartX/EndX/StartY/EndY - tile rectangle in pixels
//   minZ, maxZ                  - depth range of the tile
//   gBufferWidth/Height         - G-buffer size in pixels
//   cameraProj_11, cameraProj_22 - projection matrix scale terms
//   numLights                   - number of lights to examine
//   light_*_array               - per-light position and attenuation end
//   tileLightIndices            - output: indices of the surviving lights,
//                                 in ascending order
//
// Returns the number of indices written to tileLightIndices.
static int
IntersectLightsWithTileMinMax(
    int tileStartX, int tileEndX,
    int tileStartY, int tileEndY,
    // Tile data
    float minZ,
    float maxZ,
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    int tileLightIndices[]
    )
{
    const float halfWidth = 0.5f * static_cast<float>(gBufferWidth);
    const float halfHeight = 0.5f * static_cast<float>(gBufferHeight);

    // Side planes in the order left, right, bottom, top.  Each plane has a
    // lateral component (applied to x for the first two planes, to y for the
    // last two) and a z component.  The lateral components are the same for
    // every tile on the screen; only the z components depend on the tile.
    float planeXY[4] = { -(cameraProj_11 * halfWidth),
                          (cameraProj_11 * halfWidth),
                          (cameraProj_22 * halfHeight),
                         -(cameraProj_22 * halfHeight) };
    float planeZ[4] = {  tileEndX - halfWidth,
                        -tileStartX + halfWidth,
                         tileEndY - halfHeight,
                        -tileStartY + halfHeight };

    // Normalise so that the dot products below are true distances.
    for (int plane = 0; plane < 4; ++plane) {
        const float invLength = 1.f / sqrtf(planeXY[plane] * planeXY[plane] +
                                            planeZ[plane] * planeZ[plane]);
        planeXY[plane] *= invLength;
        planeZ[plane] *= invLength;
    }

    int kept = 0;
    for (int light = 0; light < numLights; ++light) {
        const float lightZ = light_positionView_z_array[light];
        const float negRadius = -light_attenuationEnd_array[light];

        // Depth range test.  Written as !(d >= -r) so that a NaN distance
        // rejects the light.
        if (!(lightZ - minZ >= negRadius) || !(maxZ - lightZ >= negRadius))
            continue;

        const float lightX = light_positionView_x_array[light];
        const float lightY = light_positionView_y_array[light];

        // Side plane tests; stop at the first plane the light is outside of.
        bool inside = true;
        for (int plane = 0; plane < 4 && inside; ++plane) {
            const float lateral = (plane < 2) ? lightX : lightY;
            const float distance = lightZ * planeZ[plane] +
                                   lateral * planeXY[plane];
            inside = (distance >= negRadius);
        }

        // Pack and store intersecting lights
        if (inside)
            tileLightIndices[kept++] = light;
    }

    return kept;
}
|
||||
|
||||
|
||||
void
|
||||
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// Get Z min/max for this tile
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
float minZ = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// This is a root tile, so first do a full 6-plane cull
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int lightIndices[MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int numLights = IntersectLightsWithTileMinMax(
|
||||
startX, endX, startY, endY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd, lightIndices);
|
||||
|
||||
// Now kick off the recursive process for this tile
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
numLights, framebuffer);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
|
||||
{
|
||||
MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// Update min/max Z tree
|
||||
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraNear, input->header.cameraFar);
|
||||
|
||||
int rootLevel = minMaxZTree->Levels() - 1;
|
||||
int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
|
||||
int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
|
||||
int rootTiles = rootTilesX * rootTilesY;
|
||||
for (int g = 0; g < rootTiles; ++g) {
|
||||
uint32_t tileY = g / rootTilesX;
|
||||
uint32_t tileX = g % rootTilesX;
|
||||
ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
|
||||
}
|
||||
}
|
||||
398
examples/deferred/dynamic_cilk.cpp
Normal file
398
examples/deferred/dynamic_cilk.cpp
Normal file
@@ -0,0 +1,398 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef __cilkplusplus
|
||||
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif // ISPC_IS_LINUX
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||
#define MIN_TILE_WIDTH 16
|
||||
#define MIN_TILE_HEIGHT 16
|
||||
|
||||
|
||||
#define DYNAMIC_TREE_LEVELS 5
|
||||
// If this is set to 1 then the result will be identical to the static version
|
||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||
|
||||
// Allocates `size` bytes at an address that is a multiple of `alignment`.
//
//   size      - number of usable bytes requested
//   alignment - required alignment in bytes; must be a power of two
//
// Returns the aligned pointer, or NULL if the underlying allocation fails.
// The result must be released with lAlignedFree(), never with free().
//
// Windows and Linux use the platform's aligned allocator.  Every other
// platform (including OS X) uses a portable scheme: over-allocate with
// malloc(), round the address up, and stash the pointer malloc() returned in
// the slot just below the aligned block so lAlignedFree() can recover it.
static void *
lAlignedMalloc(int64_t size, int32_t alignment) {
#if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
#elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
#else
    // Worst case we need (alignment - 1) bytes of padding, plus room for
    // the saved pointer.
    void *mem = malloc(size + (alignment - 1) + sizeof(void *));
    if (mem == 0)
        return 0;

    char *amem = ((char *)mem) + sizeof(void *);
    // Only pad when the address is actually misaligned.  Adding a full
    // `alignment` to an already-aligned address would push the end of the
    // block one byte past the end of the allocation.
    uint64_t misalignment = reinterpret_cast<uint64_t>(amem) &
                            (uint64_t)(alignment - 1);
    if (misalignment != 0)
        amem += (uint64_t)alignment - misalignment;

    ((void **)amem)[-1] = mem;
    return amem;
#endif
}


// Releases a block obtained from lAlignedMalloc().  Passing NULL is a no-op
// on the portable path.
static void
lAlignedFree(void *ptr) {
#if defined(ISPC_IS_WINDOWS)
    _aligned_free(ptr);
#elif defined(ISPC_IS_LINUX)
    free(ptr);
#else
    // The pointer originally returned by malloc() is stored immediately
    // below the aligned block.
    if (ptr != 0)
        free(((void **)ptr)[-1]);
#endif
}
|
||||
|
||||
|
||||
class MinMaxZTreeCilk
|
||||
{
|
||||
public:
|
||||
// Currently (min) tile dimensions must divide gBuffer dimensions evenly
|
||||
// Levels must be small enough that neither dimension goes below one tile
|
||||
MinMaxZTreeCilk(
|
||||
int tileWidth, int tileHeight, int levels,
|
||||
int gBufferWidth, int gBufferHeight)
|
||||
: mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
|
||||
{
|
||||
mNumTilesX = gBufferWidth / mTileWidth;
|
||||
mNumTilesY = gBufferHeight / mTileHeight;
|
||||
|
||||
// Allocate arrays
|
||||
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
int x = NumTilesX(i);
|
||||
int y = NumTilesY(i);
|
||||
assert(x > 0);
|
||||
assert(y > 0);
|
||||
// NOTE: If the following two asserts fire it probably means that
|
||||
// the base tile dimensions do not evenly divide the G-buffer dimensions
|
||||
assert(x * (mTileWidth << i) >= gBufferWidth);
|
||||
assert(y * (mTileHeight << i) >= gBufferHeight);
|
||||
mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
}
|
||||
}
|
||||
|
||||
void Update(float *zBuffer, int gBufferPitchInElements,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar)
|
||||
{
|
||||
// Compute level 0 in parallel. Outer loops is here since we use Cilk
|
||||
_Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
|
||||
ispc::ComputeZBoundsRow(tileY,
|
||||
mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
|
||||
zBuffer, gBufferPitchInElements,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
mMinZArrays[0] + (tileY * mNumTilesX),
|
||||
mMaxZArrays[0] + (tileY * mNumTilesX));
|
||||
}
|
||||
|
||||
// Generate other levels
|
||||
// NOTE: We currently don't use ispc here since it's sort of an
|
||||
// awkward gather-based reduction Using SSE odd pack/unpack
|
||||
// instructions might actually work here when we need to optimize
|
||||
for (int level = 1; level < mLevels; ++level) {
|
||||
int destTilesX = NumTilesX(level);
|
||||
int destTilesY = NumTilesY(level);
|
||||
int srcLevel = level - 1;
|
||||
int srcTilesX = NumTilesX(srcLevel);
|
||||
int srcTilesY = NumTilesY(srcLevel);
|
||||
_Cilk_for (int y = 0; y < destTilesY; ++y) {
|
||||
for (int x = 0; x < destTilesX; ++x) {
|
||||
int srcX = x << 1;
|
||||
int srcY = y << 1;
|
||||
// NOTE: Ugly branches to deal with non-multiple dimensions at some levels
|
||||
// TODO: SSE branchless min/max is probably better...
|
||||
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
if (srcX + 1 < srcTilesX) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
}
|
||||
}
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
}
|
||||
mMinZArrays[level][y * destTilesX + x] = minZ;
|
||||
mMaxZArrays[level][y * destTilesX + x] = maxZ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~MinMaxZTreeCilk() {
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
lAlignedFree(mMinZArrays[i]);
|
||||
lAlignedFree(mMaxZArrays[i]);
|
||||
}
|
||||
lAlignedFree(mMinZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
}
|
||||
|
||||
int Levels() const { return mLevels; }
|
||||
|
||||
// These round UP, so beware that the last tile for a given level may not be completely full
|
||||
// TODO: Verify this...
|
||||
int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
|
||||
int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
|
||||
int TileWidth(int level = 0) const { return (mTileWidth << level); }
|
||||
int TileHeight(int level = 0) const { return (mTileHeight << level); }
|
||||
|
||||
float MinZ(int level, int tileX, int tileY) const {
|
||||
return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
float MaxZ(int level, int tileX, int tileY) const {
|
||||
return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
|
||||
private:
|
||||
int mTileWidth;
|
||||
int mTileHeight;
|
||||
int mLevels;
|
||||
int mNumTilesX;
|
||||
int mNumTilesY;
|
||||
|
||||
// One array for each "level" in the tree
|
||||
float **mMinZArrays;
|
||||
float **mMaxZArrays;
|
||||
};
|
||||
|
||||
static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;
|
||||
|
||||
void InitDynamicCilk(InputData *input) {
|
||||
gMinMaxZTreeCilk =
|
||||
new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
|
||||
input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// If we few enough lights or this is the base case (last level), shade
|
||||
// this full tile directly
|
||||
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// Skip entirely offscreen tiles
|
||||
if (endX > startX && endY > startY) {
|
||||
ispc::ShadeTile(
|
||||
startX, endX, startY, endY,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
&input->arrays,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||
// Move down a level in the tree
|
||||
--level;
|
||||
tileX <<= 1;
|
||||
tileY <<= 1;
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
|
||||
// Work out splitting coords
|
||||
int midX = (tileX + 1) * width;
|
||||
int midY = (tileY + 1) * height;
|
||||
|
||||
// Read subtile min/max data
|
||||
// NOTE: We must be sure to handle out-of-bounds access here since
|
||||
// sometimes we'll only have 1 or 2 subtiles for non-pow-2
|
||||
// framebuffer sizes.
|
||||
bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
|
||||
bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
|
||||
|
||||
// NOTE: Order is 00, 10, 01, 11
|
||||
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
input->header.cameraFar, input->header.cameraFar};
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
input->header.cameraNear, input->header.cameraNear};
|
||||
|
||||
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
if (rightTileExists) {
|
||||
minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
|
||||
maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
|
||||
if (bottomTileExists) {
|
||||
minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
|
||||
maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
|
||||
}
|
||||
}
|
||||
if (bottomTileExists) {
|
||||
minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
|
||||
maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
|
||||
}
|
||||
|
||||
// Cull lights into subtile lists
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int subtileLightIndices[4][MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int subtileNumLights[4];
|
||||
ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd,
|
||||
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||
|
||||
// Recurse into subtiles
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
subtileLightIndices[0], subtileNumLights[0],
|
||||
framebuffer);
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||
subtileLightIndices[1], subtileNumLights[1],
|
||||
framebuffer);
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY + 1,
|
||||
subtileLightIndices[2], subtileNumLights[2],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
|
||||
subtileLightIndices[3], subtileNumLights[3],
|
||||
framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// Get Z min/max for this tile
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
float minZ = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// This is a root tile, so first do a full 6-plane cull
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int lightIndices[MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int numLights = ispc::IntersectLightsWithTileMinMax(
|
||||
startX, endX, startY, endY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd, lightIndices);
|
||||
|
||||
// Now kick off the recursive process for this tile
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
numLights, framebuffer);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
|
||||
{
|
||||
MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// Update min/max Z tree
|
||||
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraNear, input->header.cameraFar);
|
||||
|
||||
// Launch the "root" tiles. Ideally these should at least fill the
|
||||
// machine... at the moment we have a static number of "levels" to the
|
||||
// mip tree but it might make sense to compute it based on the width of
|
||||
// the machine.
|
||||
int rootLevel = minMaxZTree->Levels() - 1;
|
||||
int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
|
||||
int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
|
||||
int rootTiles = rootTilesX * rootTilesY;
|
||||
_Cilk_for (int g = 0; g < rootTiles; ++g) {
|
||||
uint32_t tileY = g / rootTilesX;
|
||||
uint32_t tileX = g % rootTilesX;
|
||||
ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // __cilkplusplus
|
||||
717
examples/deferred/kernels.ispc
Normal file
717
examples/deferred/kernels.ispc
Normal file
@@ -0,0 +1,717 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "deferred.h"
|
||||
|
||||
// Structure-of-arrays view of the scene data consumed by the kernels.
// NOTE(review): the field order presumably has to match the application-side
// declaration of this struct -- do not reorder without checking deferred.h.
struct InputDataArrays
{
    // ---- Per-pixel G-buffer planes ----
    uniform float zBuffer[];
    // Encoded surface normal, stored as half-precision floats.
    uniform unsigned int16 normalEncoded_x[];
    uniform unsigned int16 normalEncoded_y[];
    // Specular terms, stored as half-precision floats.
    uniform unsigned int16 specularAmount[];
    uniform unsigned int16 specularPower[];
    // Albedo colour channels, stored as 8-bit unorm values.
    uniform unsigned int8 albedo_x[];
    uniform unsigned int8 albedo_y[];
    uniform unsigned int8 albedo_z[];

    // ---- Per-light arrays ----
    uniform float lightPositionView_x[];
    uniform float lightPositionView_y[];
    uniform float lightPositionView_z[];
    uniform float lightAttenuationBegin[];
    uniform float lightColor_x[];
    uniform float lightColor_y[];
    uniform float lightColor_z[];
    uniform float lightAttenuationEnd[];
};
|
||||
|
||||
// Fixed-size header describing the camera and the layout of the input data.
// NOTE(review): the field order presumably has to match the application-side
// declaration of this struct -- do not reorder without checking deferred.h.
struct InputHeader
{
    // Camera projection matrix and clip distances.
    uniform float cameraProj[4][4];
    uniform float cameraNear;
    uniform float cameraFar;

    // Framebuffer size, light count and data-chunk layout.
    uniform int32 framebufferWidth;
    uniform int32 framebufferHeight;
    uniform int32 numLights;
    uniform int32 inputDataChunkSize;
    // One offset per input array; idaNum is defined outside this file.
    uniform int32 inputDataArrayOffsets[idaNum];
};
|
||||
|
||||
// Deliberately empty exported function.
// NOTE(review): presumably present only so that InputHeader appears in the
// signature of an exported function -- confirm before removing.
export void
foo(reference InputHeader h) {
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Common utility routines
|
||||
|
||||
// Dot product of the 3-vectors (x, y, z) and (a, b, c).
// The products are summed left to right: (x*a + y*b) + z*c.
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    float xa = x * a;
    float yb = y * b;
    float zc = z * c;
    return xa + yb + zc;
}
|
||||
|
||||
|
||||
// Scales the vector (x, y, z) by the reciprocal square root of its squared
// length and writes the result to (ox, oy, oz).
//
// The inputs are taken by value, so the outputs may refer to the same
// variables that were passed as inputs.
static inline void
normalize3(float x, float y, float z, reference float ox,
           reference float oy, reference float oz) {
    float invLength = rsqrt(x*x + y*y + z*z);
    ox = x * invLength;
    oy = y * invLength;
    oz = z * invLength;
}
|
||||
|
||||
|
||||
// Converts an 8-bit unsigned-normalized value to float by multiplying it
// with 1/255, so 0 maps to 0.0 and 255 maps to (approximately) 1.0.
static inline float
Unorm8ToFloat32(unsigned int8 u) {
    float asFloat = (float)u;
    return asFloat * (1.0f / 255.0f);
}
|
||||
|
||||
|
||||
// Converts a float to an 8-bit unsigned value by scaling it with 255 and
// casting.  No clamping is done here; callers are expected to pass a value
// that is already in range.
static inline unsigned int8
Float32ToUnorm8(float f) {
    float scaled = f * 255.0f;
    return (unsigned int8)scaled;
}
|
||||
|
||||
|
||||
// Computes the view-space depth bounds of one screen tile.
//
// Visits the pixels in [tileStartX, tileEndX) x [tileStartY, tileEndY),
// unprojects each depth-buffer sample using cameraProj_33 / cameraProj_43 and
// reduces the per-lane minimum and maximum into minZ / maxZ.  Samples whose
// view-space depth lies outside [cameraNear, cameraFar) are ignored; if every
// sample is ignored the outputs are minZ == cameraFar and maxZ == cameraNear.
//
// Each inner iteration reads programCount consecutive samples, so the tile
// width must be a multiple of programCount (SIMD size).
static void
ComputeZBounds(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    reference uniform float minZ,
    reference uniform float maxZ
    )
{
    // Per-lane running bounds, seeded so that any valid sample tightens them.
    float nearestZ = cameraFar;
    float farthestZ = cameraNear;

    for (uniform int32 row = tileStartY; row < tileEndY; ++row) {
        for (uniform int32 col = tileStartX; col < tileEndX; col += programCount) {
            // Unproject depth buffer Z value into view space
            float depthSample = zBuffer[(row * gBufferWidth + col) + programIndex];
            float viewZ = cameraProj_43 / (depthSample - cameraProj_33);

            // Only samples inside the camera range contribute; this leaves
            // out skybox/background or otherwise invalid pixels.
            bool usable = (viewZ < cameraFar) && (viewZ >= cameraNear);
            if (usable) {
                nearestZ = min(nearestZ, viewZ);
                farthestZ = max(farthestZ, viewZ);
            }
        }
    }

    // Collapse the per-lane bounds into the uniform outputs.
    minZ = reduce_min(nearestZ);
    maxZ = reduce_max(farthestZ);
}
|
||||
|
||||
|
||||
// Builds the list of lights that may affect a tile, given the tile's
// view-space depth range [minZ, maxZ].
//
// A light is kept when its view-space position is within attenuationEnd of
// the depth range and of each of the four side planes derived from the tile
// rectangle.  The indices of the kept lights are packed into tileLightIndices
// and the number of indices written is returned.
//
// tile width must be a multiple of programCount (SIMD size)
// numLights must currently be a multiple of programCount (SIMD size): lights
// are processed programCount at a time with no tail handling.
export uniform int32
IntersectLightsWithTileMinMax(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // Tile data
    uniform float minZ,
    uniform float maxZ,
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    reference uniform int32 tileLightIndices[]
    )
{
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;

    // Parallelize across frustum planes.
    // We really only have four side planes here, but write the code to
    // handle programCount > 4 robustly
    uniform float frustumPlanes_xy[programCount];
    uniform float frustumPlanes_z[programCount];

    // TODO: If programIndex < 4 here? Don't care about masking off the
    // rest but if interleaving ("x2" modes) the other lanes should ideally
    // not be emitted...
    {
        // One plane per lane; only lanes 0..3 are filled in and used later.
        // This one is totally constant over the whole screen... worth pulling it up at all?
        float planeXY;
        planeXY = insert(planeXY, 0, -(cameraProj_11 * gBufferScale_x));
        planeXY = insert(planeXY, 1, (cameraProj_11 * gBufferScale_x));
        planeXY = insert(planeXY, 2, (cameraProj_22 * gBufferScale_y));
        planeXY = insert(planeXY, 3, -(cameraProj_22 * gBufferScale_y));

        float planeZ;
        planeZ = insert(planeZ, 0, tileEndX - gBufferScale_x);
        planeZ = insert(planeZ, 1, -tileStartX + gBufferScale_x);
        planeZ = insert(planeZ, 2, tileEndY - gBufferScale_y);
        planeZ = insert(planeZ, 3, -tileStartY + gBufferScale_y);

        // Normalize
        float invLength = rsqrt(planeXY * planeXY + planeZ * planeZ);
        planeXY *= invLength;
        planeZ *= invLength;

        // Save out for uniform use later
        frustumPlanes_xy[programIndex] = planeXY;
        frustumPlanes_z[programIndex] = planeZ;
    }

    uniform int32 tileNumLights = 0;

    for (uniform int32 baseLightIndex = 0; baseLightIndex < numLights;
         baseLightIndex += programCount) {
        // Each lane tests one light of the current group.
        int32 lightIndex = baseLightIndex + programIndex;
        float lightZ = light_positionView_z_array[lightIndex];
        float lightRadius = light_attenuationEnd_array[lightIndex];
        float negLightRadius = -lightRadius;

        // Depth-range test against minZ, then maxZ.
        float dist = lightZ - minZ;
        bool inFrustum = (dist >= negLightRadius);

        dist = maxZ - lightZ;
        inFrustum = inFrustum && (dist >= negLightRadius);

        // This seems better than cif(!inFrustum) ccontinue; here since we
        // don't actually need to mask the rest of this function - this is
        // just a greedy early-out.  Could also structure all of this as
        // nested if() statements, but this is a bit easier to read
        if (!any(inFrustum))
            continue;

        float lightX = light_positionView_x_array[lightIndex];
        float lightY = light_positionView_y_array[lightIndex];

        // Planes 0 and 1 are tested with the light's x, planes 2 and 3 with y.
        dist = lightZ * frustumPlanes_z[0] +
            lightX * frustumPlanes_xy[0];
        inFrustum = inFrustum && (dist >= negLightRadius);

        dist = lightZ * frustumPlanes_z[1] +
            lightX * frustumPlanes_xy[1];
        inFrustum = inFrustum && (dist >= negLightRadius);

        dist = lightZ * frustumPlanes_z[2] +
            lightY * frustumPlanes_xy[2];
        inFrustum = inFrustum && (dist >= negLightRadius);

        dist = lightZ * frustumPlanes_z[3] +
            lightY * frustumPlanes_xy[3];
        inFrustum = inFrustum && (dist >= negLightRadius);

        // Pack and store intersecting lights
        cif (inFrustum) {
            tileNumLights += packed_store_active(tileLightIndices, tileNumLights,
                                                 lightIndex);
        }
    }

    return tileNumLights;
}
|
||||
|
||||
|
||||
// Computes the light list for one tile: first derives the tile's view-space
// depth bounds from the depth buffer (ComputeZBounds), then culls the lights
// against those bounds and the tile's side planes
// (IntersectLightsWithTileMinMax).
//
// Returns the number of light indices written to tileLightIndices.
//
// tile width must be a multiple of programCount (SIMD size)
// numLights must currently be a multiple of programCount (SIMD size)
static uniform int32
IntersectLightsWithTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // G-buffer data
    uniform float zBuffer[],
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    reference uniform int32 tileLightIndices[]
    )
{
    uniform float minZ, maxZ;
    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
                   zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
                   cameraNear, cameraFar,
                   minZ, maxZ);

    // Forward the caller's light count.  This used to pass MAX_LIGHTS
    // unconditionally, which silently ignored the numLights parameter.
    uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
        numLights, light_positionView_x_array, light_positionView_y_array,
        light_positionView_z_array, light_attenuationEnd_array,
        tileLightIndices);

    return tileNumLights;
}
|
||||
|
||||
|
||||
// Shades every pixel of one tile and writes 8-bit RGB to framebuffer_r/g/b.
//
// If the tile has no lights, or visualizeLightCount is set, each pixel is
// filled with the gray level min(tileNumLights << 2, 255).  Otherwise the
// view-space position and normal are reconstructed from the G-buffer and the
// contribution of every light in tileLightIndices[0 .. tileNumLights) is
// accumulated, clamped to [0, 1], gamma corrected and stored.
//
// tile width must be a multiple of programCount (SIMD size)
export void
ShadeTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    reference uniform InputDataArrays inputData,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    // Light list
    reference uniform int32 tileLightIndices[],
    uniform int32 tileNumLights,
    // UI
    uniform bool visualizeLightCount,
    // Output
    reference uniform unsigned int8 framebuffer_r[],
    reference uniform unsigned int8 framebuffer_g[],
    reference uniform unsigned int8 framebuffer_b[]
    )
{
    if (tileNumLights == 0 || visualizeLightCount) {
        // Flat fill: gray level proportional to the light count.
        uniform unsigned int8 gray = (unsigned int8)(min(tileNumLights << 2, 255));
        for (uniform int32 row = tileStartY; row < tileEndY; ++row) {
            for (uniform int32 col = tileStartX; col < tileEndX; col += programCount) {
                int32 pixel = (row * gBufferWidth + col) + programIndex;
                framebuffer_r[pixel] = gray;
                framebuffer_g[pixel] = gray;
                framebuffer_b[pixel] = gray;
            }
        }
    } else {
        uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
        uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;

        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
            uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);

            for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
                uniform int32 gBufferOffsetBase = y * gBufferWidth + x;
                int32 gBufferOffset = gBufferOffsetBase + programIndex;

                // Reconstruct position and (negative) view vector from G-buffer
                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
                float Vneg_x, Vneg_y, Vneg_z;

                float depth = inputData.zBuffer[gBufferOffset];

                // Compute screen/clip-space position
                // NOTE: Mind DX11 viewport transform and pixel center!
                float positionScreen_x = (0.5f + (float)(x + programIndex)) *
                    twoOverGBufferWidth - 1.0f;

                // Unproject depth buffer Z value into view space
                surface_positionView_z = cameraProj_43 / (depth - cameraProj_33);
                surface_positionView_x = positionScreen_x * surface_positionView_z /
                    cameraProj_11;
                surface_positionView_y = positionScreen_y * surface_positionView_z /
                    cameraProj_22;

                // We actually end up with a vector pointing *at* the
                // surface (i.e. the negative view vector)
                normalize3(surface_positionView_x, surface_positionView_y,
                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);

                // Reconstruct normal from the two encoded G-buffer components
                float surface_normal_x, surface_normal_y, surface_normal_z;
                float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
                float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);

                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
                float m = sqrt(4.0f * f - 1.0f);

                surface_normal_x = m * (4.0f * normal_x - 2.0f);
                surface_normal_y = m * (4.0f * normal_y - 2.0f);
                surface_normal_z = 3.0f - 8.0f * f;

                // Load other G-buffer parameters
                float surface_specularAmount =
                    half_to_float_fast(inputData.specularAmount[gBufferOffset]);
                float surface_specularPower =
                    half_to_float_fast(inputData.specularPower[gBufferOffset]);
                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);

                // Accumulated lighting for this pixel
                float lit_x = 0.0f;
                float lit_y = 0.0f;
                float lit_z = 0.0f;
                for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
                     ++tileLightIndex) {
                    uniform int32 lightIndex = tileLightIndices[tileLightIndex];

                    // Gather light data relevant to initial culling
                    uniform float light_positionView_x =
                        inputData.lightPositionView_x[lightIndex];
                    uniform float light_positionView_y =
                        inputData.lightPositionView_y[lightIndex];
                    uniform float light_positionView_z =
                        inputData.lightPositionView_z[lightIndex];
                    uniform float light_attenuationEnd =
                        inputData.lightAttenuationEnd[lightIndex];

                    // Compute light vector
                    float L_x = light_positionView_x - surface_positionView_x;
                    float L_y = light_positionView_y - surface_positionView_y;
                    float L_z = light_positionView_z - surface_positionView_z;

                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);

                    // Clip at end of attenuation
                    float light_attenuationEnd2 = light_attenuationEnd * light_attenuationEnd;

                    cif (distanceToLight2 < light_attenuationEnd2) {
                        float distanceToLight = sqrt(distanceToLight2);

                        // HLSL "rcp" is allowed to be fairly inaccurate
                        float distanceToLightRcp = rcp(distanceToLight);
                        L_x *= distanceToLightRcp;
                        L_y *= distanceToLightRcp;
                        L_z *= distanceToLightRcp;

                        // Start computing brdf
                        float NdotL = dot3(surface_normal_x, surface_normal_y,
                                           surface_normal_z, L_x, L_y, L_z);

                        // Clip back facing
                        cif (NdotL > 0.0f) {
                            uniform float light_attenuationBegin =
                                inputData.lightAttenuationBegin[lightIndex];

                            // Light distance attenuation (linstep)
                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
                            float falloffPosition = (light_attenuationEnd - distanceToLight);
                            float attenuation = min(falloffPosition / lightRange, 1.0f);

                            // Half vector between the light direction and the
                            // view direction (-Vneg)
                            float H_x = (L_x - Vneg_x);
                            float H_y = (L_y - Vneg_y);
                            float H_z = (L_z - Vneg_z);
                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);

                            float NdotH = dot3(surface_normal_x, surface_normal_y,
                                               surface_normal_z, H_x, H_y, H_z);
                            NdotH = max(NdotH, 0.0f);

                            float specular = pow(NdotH, surface_specularPower);
                            float specularNorm = (surface_specularPower + 2.0f) *
                                (1.0f / 8.0f);
                            float specularContrib = surface_specularAmount *
                                specularNorm * specular;

                            float intensity = attenuation * NdotL * (1.0f + specularContrib);

                            uniform float light_color_x = inputData.lightColor_x[lightIndex];
                            uniform float light_color_y = inputData.lightColor_y[lightIndex];
                            uniform float light_color_z = inputData.lightColor_z[lightIndex];

                            float lightContrib_x = surface_albedo_x * light_color_x;
                            float lightContrib_y = surface_albedo_y * light_color_y;
                            float lightContrib_z = surface_albedo_z * light_color_z;

                            lit_x += lightContrib_x * intensity;
                            lit_y += lightContrib_y * intensity;
                            lit_z += lightContrib_z * intensity;
                        }
                    }
                }

                // Gamma correct
                // These pows are pretty slow right now, but we can do
                // something faster if really necessary to squeeze every
                // last bit of performance out of it
                float gamma = 1.0 / 2.2f;
                lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);

                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
            }
        }
    }
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Static decomposition
|
||||
|
||||
// Task body for the static decomposition: renders tile number g of a grid
// that is num_groups_x tiles wide.  Tiles are MIN_TILE_WIDTH x MIN_TILE_HEIGHT
// pixels and are numbered row by row.
//
// The tile's light list is built with IntersectLightsWithTile and the tile is
// then shaded with ShadeTile.  num_groups_y is accepted but not used here.
//
// NOTE(review): the tile end coordinates are not clamped to the framebuffer
// size; this presumably relies on the framebuffer dimensions being multiples
// of the tile size -- confirm.
task void
RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
           reference uniform InputHeader inputHeader,
           reference uniform InputDataArrays inputData,
           uniform int visualizeLightCount,
           // Output
           reference uniform unsigned int8 framebuffer_r[],
           reference uniform unsigned int8 framebuffer_g[],
           reference uniform unsigned int8 framebuffer_b[]) {
    // Position of this tile in the grid, then its pixel rectangle.
    uniform int32 tileRow = g / num_groups_x;
    uniform int32 tileCol = g % num_groups_x;
    uniform int32 x0 = tileCol * MIN_TILE_WIDTH;
    uniform int32 y0 = tileRow * MIN_TILE_HEIGHT;
    uniform int32 x1 = x0 + MIN_TILE_WIDTH;
    uniform int32 y1 = y0 + MIN_TILE_HEIGHT;

    uniform int sTileLightIndices[MAX_LIGHTS];  // Light list for the tile

    uniform int framebufferWidth = inputHeader.framebufferWidth;
    uniform int framebufferHeight = inputHeader.framebufferHeight;

    // Projection-matrix entries used below (0-based indices here; the callee
    // parameter names use 1-based indices).
    uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
    uniform float cameraProj_11 = inputHeader.cameraProj[1][1];
    uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
    uniform float cameraProj_32 = inputHeader.cameraProj[3][2];

    // Light intersection
    uniform int sTileNumLights =
        IntersectLightsWithTile(x0, x1,
                                y0, y1,
                                framebufferWidth, framebufferHeight,
                                inputData.zBuffer,
                                cameraProj_00, cameraProj_11,
                                cameraProj_22, cameraProj_32,
                                inputHeader.cameraNear, inputHeader.cameraFar,
                                MAX_LIGHTS,
                                inputData.lightPositionView_x,
                                inputData.lightPositionView_y,
                                inputData.lightPositionView_z,
                                inputData.lightAttenuationEnd,
                                sTileLightIndices);

    // Shading
    ShadeTile(x0, x1, y0, y1,
              framebufferWidth, framebufferHeight, inputData,
              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
              sTileLightIndices, sTileNumLights, visualizeLightCount,
              framebuffer_r, framebuffer_g, framebuffer_b);
}
|
||||
|
||||
|
||||
// Entry point for the static decomposition: covers the framebuffer with a
// grid of MIN_TILE_WIDTH x MIN_TILE_HEIGHT tiles (tile counts are rounded up)
// and launches one RenderTile task per tile.
export void
RenderStatic(reference uniform InputHeader inputHeader,
             reference uniform InputDataArrays inputData,
             uniform int visualizeLightCount,
             // Output
             reference uniform unsigned int8 framebuffer_r[],
             reference uniform unsigned int8 framebuffer_g[],
             reference uniform unsigned int8 framebuffer_b[]) {
    // Ceiling division so that partial tiles at the edges are counted.
    uniform int tilesAcross = (inputHeader.framebufferWidth +
                               MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
    uniform int tilesDown = (inputHeader.framebufferHeight +
                             MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
    uniform int tileCount = tilesAcross * tilesDown;

    for (uniform int tile = 0; tile < tileCount; ++tile)
        launch < RenderTile(tile, tilesAcross, tilesDown,
                            inputHeader, inputData, visualizeLightCount,
                            framebuffer_r, framebuffer_g, framebuffer_b) >;
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Routines for dynamic decomposition path
|
||||
|
||||
// Computes the view-space depth bounds for every tile in one row of tiles.
//
// For tileX in [0, numTilesX) the bounds of the tile at (tileX, tileY) are
// obtained with ComputeZBounds and stored in minZArray[tileX] and
// maxZArray[tileX].  numTilesY is accepted but not used here.
//
// tile width must be a multiple of programCount (SIMD size)
export void
ComputeZBoundsRow(
    uniform int32 tileY,
    uniform int32 tileWidth, uniform int32 tileHeight,
    uniform int32 numTilesX, uniform int32 numTilesY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    reference uniform float minZArray[],
    reference uniform float maxZArray[]
    )
{
    // The vertical extent is the same for every tile of the row.
    uniform int32 rowStartY = tileY * tileHeight;
    uniform int32 rowEndY = rowStartY + tileHeight;

    for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
        uniform int32 colStartX = tileX * tileWidth;
        uniform int32 colEndX = colStartX + tileWidth;

        uniform float tileMinZ, tileMaxZ;
        ComputeZBounds(
            colStartX, colEndX,
            rowStartY, rowEndY,
            zBuffer, gBufferWidth,
            cameraProj_33, cameraProj_43, cameraNear, cameraFar,
            tileMinZ, tileMaxZ);

        minZArray[tileX] = tileMinZ;
        maxZArray[tileX] = tileMaxZ;
    }
}
|
||||
|
||||
|
||||
// Distributes a tile's light list among its four subtiles (00, 10, 01, 11),
// which are separated by the planes through tileMidX and tileMidY.
//
// Each light named in lightIndices[0 .. numLights) is tested against every
// subtile's depth range (subtileMinZ / subtileMaxZ) and against the two split
// planes.  The indices of the lights kept for subtile k are packed into
// subtileIndices starting at k * subtileIndicesPitch, and their count is
// written to subtileNumLights[k].
//
// numLights need not be a multiple of programCount here, but the input and output arrays
// should be able to handle programCount-sized load/stores.
export void
SplitTileMinMax(
    uniform int32 tileMidX, uniform int32 tileMidY,
    // Subtile data (00, 10, 01, 11)
    uniform float subtileMinZ[],
    uniform float subtileMaxZ[],
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    reference uniform int32 lightIndices[],
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Outputs
    // TODO: ISPC doesn't currently like multidimensional arrays so we'll do the
    // indexing math ourselves
    reference uniform int32 subtileIndices[],
    uniform int32 subtileIndicesPitch,
    reference uniform int32 subtileNumLights[]
    )
{
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;

    // Parallelize across frustum planes
    // Only have 2 frustum split planes here so may not be worth it, but
    // we'll do it for now for consistency
    uniform float frustumPlanes_xy[programCount];
    uniform float frustumPlanes_z[programCount];

    // Lane 0 holds the vertical split plane (tested with x), lane 1 the
    // horizontal one (tested with y); other lanes are not used later.
    // This one is totally constant over the whole screen... worth pulling it up at all?
    float planeXY;
    planeXY = insert(planeXY, 0, -(cameraProj_11 * gBufferScale_x));
    planeXY = insert(planeXY, 1, (cameraProj_22 * gBufferScale_y));

    float planeZ;
    planeZ = insert(planeZ, 0, tileMidX - gBufferScale_x);
    planeZ = insert(planeZ, 1, tileMidY - gBufferScale_y);

    // Normalize
    float invLength = rsqrt(planeXY * planeXY + planeZ * planeZ);
    planeXY *= invLength;
    planeZ *= invLength;

    // Save out for uniform use later
    frustumPlanes_xy[programIndex] = planeXY;
    frustumPlanes_z[programIndex] = planeZ;

    // Write cursor of each subtile's list inside subtileIndices
    uniform int32 subtileLightOffset[4];
    subtileLightOffset[0] = 0 * subtileIndicesPitch;
    subtileLightOffset[1] = 1 * subtileIndicesPitch;
    subtileLightOffset[2] = 2 * subtileIndicesPitch;
    subtileLightOffset[3] = 3 * subtileIndicesPitch;

    for (int32 i = programIndex; i < numLights; i += programCount) {
        // TODO: ISPC says gather required here when it actually
        // isn't... this could be fixed by nesting an if() within a
        // uniform loop, but I'm not totally sure if that's a win
        // overall. For now we'll just eat the perf cost for cleanliness
        // since the below are real gathers anyways.
        int32 lightIndex = lightIndices[i];

        float lightX = light_positionView_x_array[lightIndex];
        float lightY = light_positionView_y_array[lightIndex];
        float lightZ = light_positionView_z_array[lightIndex];
        float lightRadius = light_attenuationEnd_array[lightIndex];
        float negLightRadius = -lightRadius;

        // Test lights against subtile z bounds
        bool inFrustum[4];
        inFrustum[0] = (lightZ - subtileMinZ[0] >= negLightRadius) &&
                       (subtileMaxZ[0] - lightZ >= negLightRadius);
        inFrustum[1] = (lightZ - subtileMinZ[1] >= negLightRadius) &&
                       (subtileMaxZ[1] - lightZ >= negLightRadius);
        inFrustum[2] = (lightZ - subtileMinZ[2] >= negLightRadius) &&
                       (subtileMaxZ[2] - lightZ >= negLightRadius);
        inFrustum[3] = (lightZ - subtileMinZ[3] >= negLightRadius) &&
                       (subtileMaxZ[3] - lightZ >= negLightRadius);

        // Signed distances of the light to the two split planes
        float dx = lightZ * frustumPlanes_z[0] +
            lightX * frustumPlanes_xy[0];
        float dy = lightZ * frustumPlanes_z[1] +
            lightY * frustumPlanes_xy[1];

        // A light farther from a split plane than its radius can only touch
        // the subtiles on its own side of that plane.
        cif (abs(dx) > lightRadius) {
            bool positiveX = dx > 0.0f;
            inFrustum[0] = inFrustum[0] && positiveX;   // 00 subtile
            inFrustum[1] = inFrustum[1] && !positiveX;  // 10 subtile
            inFrustum[2] = inFrustum[2] && positiveX;   // 01 subtile
            inFrustum[3] = inFrustum[3] && !positiveX;  // 11 subtile
        }
        cif (abs(dy) > lightRadius) {
            bool positiveY = dy > 0.0f;
            inFrustum[0] = inFrustum[0] && positiveY;   // 00 subtile
            inFrustum[1] = inFrustum[1] && positiveY;   // 10 subtile
            inFrustum[2] = inFrustum[2] && !positiveY;  // 01 subtile
            inFrustum[3] = inFrustum[3] && !positiveY;  // 11 subtile
        }

        // Pack and store intersecting lights
        // TODO: Experiment with a loop here instead
        cif (inFrustum[0])
            subtileLightOffset[0] += packed_store_active(subtileIndices,
                                                         subtileLightOffset[0],
                                                         lightIndex);
        cif (inFrustum[1])
            subtileLightOffset[1] += packed_store_active(subtileIndices,
                                                         subtileLightOffset[1],
                                                         lightIndex);
        cif (inFrustum[2])
            subtileLightOffset[2] += packed_store_active(subtileIndices,
                                                         subtileLightOffset[2],
                                                         lightIndex);
        cif (inFrustum[3])
            subtileLightOffset[3] += packed_store_active(subtileIndices,
                                                         subtileLightOffset[3],
                                                         lightIndex);
    }

    // Convert the final cursors back into per-subtile counts
    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
    subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
    subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
    subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
}
|
||||
137
examples/deferred/main.cpp
Normal file
137
examples/deferred/main.cpp
Normal file
@@ -0,0 +1,137 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#define NOMINMAX
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include "../timing.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (argc != 2) {
|
||||
printf("usage: deferred_shading <input_file>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
InputData *input = CreateInputDataFromFile(argv[1]);
|
||||
if (!input) {
|
||||
printf("Failed to load input file \"%s\"!\n", argv[1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
Framebuffer framebuffer(input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
|
||||
InitDynamicC(input);
|
||||
#ifdef __cilkplusplus
|
||||
InitDynamicCilk(input);
|
||||
#endif // __cilkplusplus
|
||||
|
||||
int nframes = 5;
|
||||
double ispcCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
ispc::RenderStatic(&input->header, &input->arrays,
|
||||
VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer.r, framebuffer.g, framebuffer.b);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
ispcCycles = std::min(ispcCycles, mcycles);
|
||||
}
|
||||
printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
|
||||
"%d x %d image\n", ispcCycles,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight);
|
||||
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
|
||||
|
||||
double serialCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
DispatchDynamicC(input, &framebuffer);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
serialCycles = std::min(serialCycles, mcycles);
|
||||
}
|
||||
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles\n",
|
||||
serialCycles);
|
||||
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
|
||||
|
||||
#ifdef __cilkplusplus
|
||||
double dynamicCilkCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
DispatchDynamicCilk(input, &framebuffer);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
|
||||
}
|
||||
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles\n",
|
||||
dynamicCilkCycles);
|
||||
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
|
||||
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
|
||||
#else
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
|
||||
#endif // __cilkplusplus
|
||||
|
||||
DeleteInputData(input);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -15,6 +15,14 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot_tasks", "mandelb
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench_instrumented", "aobench_instrumented\aobench_instrumented.vcxproj", "{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "volume", "volume_rendering\volume.vcxproj", "{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.vcxproj", "{2EF070A1-F62F-4E6A-944B-88D140945C3C}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Win32 = Debug|Win32
|
||||
@@ -79,6 +87,38 @@ Global
|
||||
{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.Build.0 = Release|Win32
|
||||
{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.ActiveCfg = Release|x64
|
||||
{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.Build.0 = Release|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.Build.0 = Debug|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.Build.0 = Release|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.ActiveCfg = Release|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.Build.0 = Release|x64
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|x64.Build.0 = Debug|x64
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|Win32.Build.0 = Release|Win32
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|x64.ActiveCfg = Release|x64
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|x64.Build.0 = Release|x64
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|x64.Build.0 = Debug|x64
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.Build.0 = Release|Win32
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.ActiveCfg = Release|x64
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.Build.0 = Release|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.Build.0 = Debug|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
||||
@@ -64,6 +64,7 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
||||
fputc(c, fp);
|
||||
}
|
||||
fclose(fp);
|
||||
printf("Wrote image file %s\n", fn);
|
||||
}
|
||||
|
||||
|
||||
|
||||
0
examples/mandelbrot/mandelbrot.vcxproj
Executable file → Normal file
0
examples/mandelbrot/mandelbrot.vcxproj
Executable file → Normal file
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
|
||||
float z_re = c_re, z_im = c_im;
|
||||
int i;
|
||||
for (i = 0; i < count; ++i) {
|
||||
if (z_re * z_re + z_im * z_im > 4.)
|
||||
if (z_re * z_re + z_im * z_im > 4.f)
|
||||
break;
|
||||
|
||||
float new_re = z_re*z_re - z_im*z_im;
|
||||
|
||||
@@ -1,18 +1,12 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=tasks_pthreads.cpp
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
ifeq ($(ARCH), Darwin)
|
||||
TASK_CXX=tasks_gcd.cpp
|
||||
TASK_LIB=
|
||||
endif
|
||||
|
||||
TASK_OBJ=$(addprefix objs/, $(TASK_CXX:.cpp=.o))
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
|
||||
@@ -32,6 +26,9 @@ mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include <string.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "mandelbrot_ispc.h"
|
||||
@@ -64,6 +65,7 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
||||
fputc(c, fp);
|
||||
}
|
||||
fclose(fp);
|
||||
printf("Wrote image file %s\n", fn);
|
||||
}
|
||||
|
||||
|
||||
@@ -98,8 +100,12 @@ ensureTargetISAIsSupported() {
|
||||
}
|
||||
}
|
||||
|
||||
// Print the command-line synopsis on stderr and terminate the process with
// exit status 1.  Never returns.
static void usage() {
    fputs("usage: mandelbrot [--scale=<factor>]\n", stderr);
    exit(1);
}
|
||||
|
||||
int main() {
|
||||
int main(int argc, char *argv[]) {
|
||||
unsigned int width = 1536;
|
||||
unsigned int height = 1024;
|
||||
float x0 = -2;
|
||||
@@ -107,10 +113,26 @@ int main() {
|
||||
float y0 = -1;
|
||||
float y1 = 1;
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
if (argc == 1)
|
||||
;
|
||||
else if (argc == 2) {
|
||||
if (strncmp(argv[1], "--scale=", 8) == 0) {
|
||||
float scale = atof(argv[1] + 8);
|
||||
if (scale == 0.f)
|
||||
usage();
|
||||
width *= scale;
|
||||
height *= scale;
|
||||
// round up to multiples of 16
|
||||
width = (width + 0xf) & ~0xf;
|
||||
height = (height + 0xf) & ~0xf;
|
||||
}
|
||||
else
|
||||
usage();
|
||||
}
|
||||
else
|
||||
usage();
|
||||
|
||||
extern void TasksInit();
|
||||
TasksInit();
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
int maxIterations = 512;
|
||||
int *buf = new int[width*height];
|
||||
@@ -121,6 +143,9 @@ int main() {
|
||||
//
|
||||
double minISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
reset_and_start_timer();
|
||||
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
@@ -130,9 +155,6 @@ int main() {
|
||||
printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
|
||||
writePPM(buf, width, height, "mandelbrot-ispc.ppm");
|
||||
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
@@ -140,6 +162,9 @@ int main() {
|
||||
//
|
||||
double minSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
reset_and_start_timer();
|
||||
mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
|
||||
@@ -53,11 +53,14 @@ mandel(float c_re, float c_im, int count) {
|
||||
[ystart,yend).
|
||||
*/
|
||||
task void
|
||||
mandelbrot_scanlines(uniform int ystart, uniform int yend,
|
||||
mandelbrot_scanlines(uniform int ybase, uniform int span,
|
||||
uniform float x0, uniform float dx,
|
||||
uniform float y0, uniform float dy,
|
||||
uniform int width, uniform int maxIterations,
|
||||
reference uniform int output[]) {
|
||||
uniform int ystart = ybase + taskIndex * span;
|
||||
uniform int yend = ystart + span;
|
||||
|
||||
for (uniform int j = ystart; j < yend; ++j) {
|
||||
for (uniform int i = 0; i < width; i += programCount) {
|
||||
float x = x0 + (programIndex + i) * dx;
|
||||
@@ -70,6 +73,20 @@ mandelbrot_scanlines(uniform int ystart, uniform int yend,
|
||||
}
|
||||
|
||||
|
||||
/* Task that covers one horizontal band of the image.  The image is divided
   into taskCount bands of height/taskCount scanlines each; this task
   handles band number taskIndex and launches one mandelbrot_scanlines task
   per 'span' scanlines inside it.

   height is not required to be a multiple of taskCount: the integer
   division rounds down, so the final band is extended to the bottom of the
   image.  Otherwise the leftover scanlines would never be computed. */
task void
mandelbrot_chunk(uniform float x0, uniform float dx,
                 uniform float y0, uniform float dy,
                 uniform int width, uniform int height,
                 uniform int maxIterations, reference uniform int output[]) {
    uniform int rowsPerChunk = height / taskCount;
    uniform int ystart = taskIndex * rowsPerChunk;
    uniform int yend = ystart + rowsPerChunk;
    uniform int span = 1;

    // The last band also picks up the (height % taskCount) remainder rows.
    if (taskIndex == taskCount - 1)
        yend = height;

    launch[(yend-ystart)/span] < mandelbrot_scanlines(ystart, span, x0, dx, y0, dy,
                                                      width, maxIterations, output) >;
}
|
||||
|
||||
|
||||
export void
|
||||
mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
uniform float x1, uniform float y1,
|
||||
@@ -78,9 +95,6 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
uniform float dx = (x1 - x0) / width;
|
||||
uniform float dy = (y1 - y0) / height;
|
||||
|
||||
/* Launch task to compute results for spans of 'span' scanlines. */
|
||||
uniform int span = 2;
|
||||
for (uniform int j = 0; j < height; j += span)
|
||||
launch < mandelbrot_scanlines(j, j+span, x0, dx, y0, dy, width,
|
||||
maxIterations, output) >;
|
||||
launch[32] < mandelbrot_chunk(x0, dx, y0, dy, width, height,
|
||||
maxIterations, output) >;
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
|
||||
float z_re = c_re, z_im = c_im;
|
||||
int i;
|
||||
for (i = 0; i < count; ++i) {
|
||||
if (z_re * z_re + z_im * z_im > 4.)
|
||||
if (z_re * z_re + z_im * z_im > 4.f)
|
||||
break;
|
||||
|
||||
float new_re = z_re*z_re - z_im*z_im;
|
||||
|
||||
4
examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
Executable file → Normal file
4
examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -143,7 +143,7 @@
|
||||
<ItemGroup>
|
||||
<ClCompile Include="mandelbrot.cpp" />
|
||||
<ClCompile Include="mandelbrot_serial.cpp" />
|
||||
<ClCompile Include="tasks_concrt.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="mandelbrot.ispc">
|
||||
|
||||
@@ -1,141 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* Simple task system implementation for ispc based on Microsoft's
|
||||
Concurrency Runtime. */
|
||||
|
||||
#include <windows.h>
|
||||
#include <concrt.h>
|
||||
using namespace Concurrency;
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
void ISPCLaunch(void *f, void *data);
|
||||
void ISPCSync();
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
||||
void ISPCFree(void *ptr);
|
||||
}
|
||||
|
||||
typedef void (*TaskFuncType)(void *, int, int);
|
||||
|
||||
struct TaskInfo {
|
||||
TaskFuncType ispcFunc;
|
||||
void *ispcData;
|
||||
};
|
||||
|
||||
// This is a simple implementation that just aborts if more than MAX_TASKS
|
||||
// are launched. It could easily be extended to be more general...
|
||||
|
||||
#define MAX_TASKS 4096
|
||||
static int taskOffset;
|
||||
static TaskInfo taskInfo[MAX_TASKS];
|
||||
static event *events[MAX_TASKS];
|
||||
static CRITICAL_SECTION criticalSection;
|
||||
static bool initialized = false;
|
||||
|
||||
void
|
||||
TasksInit() {
|
||||
InitializeCriticalSection(&criticalSection);
|
||||
for (int i = 0; i < MAX_TASKS; ++i)
|
||||
events[i] = new event;
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
|
||||
void __cdecl
|
||||
lRunTask(LPVOID param) {
|
||||
TaskInfo *ti = (TaskInfo *)param;
|
||||
|
||||
// Actually run the task.
|
||||
// FIXME: like the tasks_gcd.cpp implementation, this is passing bogus
|
||||
// values for the threadIndex and threadCount builtins, which in turn
|
||||
// will cause bugs in code that uses those. FWIW this example doesn't
|
||||
// use them...
|
||||
int threadIndex = 0;
|
||||
int threadCount = 1;
|
||||
ti->ispcFunc(ti->ispcData, threadIndex, threadCount);
|
||||
|
||||
// Signal the event that this task is done
|
||||
int taskNum = ti - &taskInfo[0];
|
||||
events[taskNum]->set();
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ISPCLaunch(void *func, void *data) {
|
||||
if (!initialized) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Get a TaskInfo struct for this task
|
||||
EnterCriticalSection(&criticalSection);
|
||||
TaskInfo *ti = &taskInfo[taskOffset++];
|
||||
assert(taskOffset < MAX_TASKS);
|
||||
LeaveCriticalSection(&criticalSection);
|
||||
|
||||
// And pass it on to the Concurrency Runtime...
|
||||
ti->ispcFunc = (TaskFuncType)func;
|
||||
ti->ispcData = data;
|
||||
CurrentScheduler::ScheduleTask(lRunTask, ti);
|
||||
}
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
if (!initialized) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
event::wait_for_multiple(&events[0], taskOffset, true,
|
||||
COOPERATIVE_TIMEOUT_INFINITE);
|
||||
|
||||
for (int i = 0; i < taskOffset; ++i)
|
||||
events[i]->reset();
|
||||
|
||||
taskOffset = 0;
|
||||
}
|
||||
|
||||
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment) {
|
||||
return _aligned_malloc(size, alignment);
|
||||
}
|
||||
|
||||
|
||||
// Deallocation hook matching ISPCMalloc(): hands 'ptr' to _aligned_free().
void ISPCFree(void *ptr) {
    _aligned_free(ptr);
    return;
}
|
||||
@@ -1,103 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* A simple task system for ispc programs based on Apple's Grand Central
|
||||
Dispatch. */
|
||||
|
||||
#include <dispatch/dispatch.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static bool initialized = false;
|
||||
static dispatch_queue_t gcdQueue;
|
||||
static dispatch_group_t gcdGroup;
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
void ISPCLaunch(void *f, void *data);
|
||||
void ISPCSync();
|
||||
}
|
||||
|
||||
struct TaskInfo {
|
||||
void *func;
|
||||
void *data;
|
||||
};
|
||||
|
||||
|
||||
void
|
||||
TasksInit() {
|
||||
gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
|
||||
gcdGroup = dispatch_group_create();
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
lRunTask(void *ti) {
|
||||
typedef void (*TaskFuncType)(void *, int, int);
|
||||
TaskInfo *taskInfo = (TaskInfo *)ti;
|
||||
|
||||
TaskFuncType func = (TaskFuncType)(taskInfo->func);
|
||||
|
||||
// FIXME: these are bogus values; may cause bugs in code that depends
|
||||
// on them having unique values in different threads.
|
||||
int threadIndex = 0;
|
||||
int threadCount = 1;
|
||||
// Actually run the task
|
||||
func(taskInfo->data, threadIndex, threadCount);
|
||||
|
||||
// FIXME: taskInfo leaks...
|
||||
}
|
||||
|
||||
|
||||
void ISPCLaunch(void *func, void *data) {
|
||||
if (!initialized) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
TaskInfo *ti = new TaskInfo;
|
||||
ti->func = func;
|
||||
ti->data = data;
|
||||
dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
|
||||
}
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
if (!initialized) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Wait for all of the tasks in the group to complete before returning
|
||||
dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
|
||||
}
|
||||
@@ -1,295 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <pthread.h>
|
||||
#include <semaphore.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/sysctl.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#include <vector>
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
void ISPCLaunch(void *f, void *data);
|
||||
void ISPCSync();
|
||||
}
|
||||
|
||||
|
||||
static int nThreads;
|
||||
static pthread_t *threads;
|
||||
static pthread_mutex_t taskQueueMutex;
|
||||
static std::vector<std::pair<void *, void *> > taskQueue;
|
||||
static sem_t *workerSemaphore;
|
||||
static uint32_t numUnfinishedTasks;
|
||||
static pthread_mutex_t tasksRunningConditionMutex;
|
||||
static pthread_cond_t tasksRunningCondition;
|
||||
|
||||
static void *lTaskEntry(void *arg);
|
||||
|
||||
/** Figure out how many CPU cores there are in the system.

    On Linux this asks sysconf() for the number of online processors; on
    other systems (Mac) it queries the "hw.logicalcpu" sysctl.  If the
    query fails, a warning is printed to stderr and 2 is returned as a
    guess, so the result is always usable as a thread count.
 */
static int
lNumCPUCores() {
#if defined(__linux__)
    long nOnline = sysconf(_SC_NPROCESSORS_ONLN);
    if (nOnline < 1) {
        // sysconf() reports failure as -1; don't pass that on as a count.
        fprintf(stderr, "sysconf() to find number of cores failed. Guessing 2.\n");
        return 2;
    }
    return (int)nOnline;
#else
    // Mac
    int mib[2];
    mib[0] = CTL_HW;
    size_t length = 2;
    if (sysctlnametomib("hw.logicalcpu", mib, &length) == -1) {
        fprintf(stderr, "sysctlnametomib() failed. Guessing 2 cores.\n");
        return 2;
    }
    assert(length == 2);

    int nCores = 0;
    size_t size = sizeof(nCores);

    if (sysctl(mib, 2, &nCores, &size, NULL, 0) == -1) {
        fprintf(stderr, "sysctl() to find number of cores present failed. Guessing 2.\n");
        return 2;
    }
    return nCores;
#endif
}
|
||||
|
||||
void
|
||||
TasksInit() {
|
||||
nThreads = lNumCPUCores();
|
||||
|
||||
threads = new pthread_t[nThreads];
|
||||
|
||||
int err;
|
||||
if ((err = pthread_mutex_init(&taskQueueMutex, NULL)) != 0) {
|
||||
fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
char name[32];
|
||||
sprintf(name, "mandelbrot.%d", (int)getpid());
|
||||
workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
|
||||
if (!workerSemaphore) {
|
||||
fprintf(stderr, "Error creating semaphore: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ((err = pthread_cond_init(&tasksRunningCondition, NULL)) != 0) {
|
||||
fprintf(stderr, "Error creating condition variable: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ((err = pthread_mutex_init(&tasksRunningConditionMutex, NULL)) != 0) {
|
||||
fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (int i = 0; i < nThreads; ++i) {
|
||||
err = pthread_create(&threads[i], NULL, &lTaskEntry, reinterpret_cast<void *>(i));
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ISPCLaunch(void *f, void *d) {
|
||||
if (threads == NULL) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Acquire mutex, add task
|
||||
//
|
||||
int err;
|
||||
if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
taskQueue.push_back(std::make_pair(f, d));
|
||||
|
||||
if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Update count of number of tasks left to run
|
||||
//
|
||||
if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
++numUnfinishedTasks;
|
||||
|
||||
if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Post to the worker semaphore to wake up worker threads that are
|
||||
// sleeping waiting for tasks to show up
|
||||
//
|
||||
if ((err = sem_post(workerSemaphore)) != 0) {
|
||||
fprintf(stderr, "Error from sem_post: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void *
|
||||
lTaskEntry(void *arg) {
|
||||
int threadIndex = int(reinterpret_cast<int64_t>(arg));
|
||||
int threadCount = nThreads;
|
||||
|
||||
while (true) {
|
||||
int err;
|
||||
if ((err = sem_wait(workerSemaphore)) != 0) {
|
||||
fprintf(stderr, "Error from sem_wait: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::pair<void *, void *> myTask;
|
||||
//
|
||||
// Acquire mutex, get task
|
||||
//
|
||||
if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
if (taskQueue.size() == 0) {
|
||||
//
|
||||
// Task queue is empty, go back and wait on the semaphore
|
||||
//
|
||||
if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
myTask = taskQueue.back();
|
||||
taskQueue.pop_back();
|
||||
|
||||
if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Do work for _myTask_
|
||||
//
|
||||
typedef void (*TaskFunType)(void *, int, int);
|
||||
TaskFunType func = (TaskFunType)myTask.first;
|
||||
func(myTask.second, threadIndex, threadCount);
|
||||
|
||||
//
|
||||
// Decrement the number of unfinished tasks counter
|
||||
//
|
||||
if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int unfinished = --numUnfinishedTasks;
|
||||
if (unfinished == 0) {
|
||||
//
|
||||
// Signal the "no more tasks are running" condition if all of
|
||||
// them are done.
|
||||
//
|
||||
int err;
|
||||
if ((err = pthread_cond_signal(&tasksRunningCondition)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_cond_signal: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
pthread_exit(NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
if (threads == NULL) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int err;
|
||||
if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// As long as there are tasks running, wait on the condition variable;
|
||||
// doing so causes this thread to go to sleep until someone signals on
|
||||
// the tasksRunningCondition condition variable.
|
||||
while (numUnfinishedTasks > 0) {
|
||||
if ((err = pthread_cond_wait(&tasksRunningCondition,
|
||||
&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_cond_wait: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// We acquire ownership of the condition variable mutex when the above
|
||||
// pthread_cond_wait returns.
|
||||
// FIXME: is there a lurking issue here if numUnfinishedTasks gets back
|
||||
// to zero by the time we get to ISPCSync() and thence we're trying to
|
||||
// unlock a mutex we don't have a lock on?
|
||||
if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
3
examples/noise/.gitignore
vendored
Normal file
3
examples/noise/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
noise
|
||||
*.ppm
|
||||
objs
|
||||
26
examples/noise/Makefile
Normal file
26
examples/noise/Makefile
Normal file
@@ -0,0 +1,26 @@
|
||||
|
||||
# Builds the "noise" example: noise.cpp and noise_serial.cpp are compiled
# with $(CXX), noise.ispc is compiled with $(ISPC) into an object file plus
# a generated header, and everything is linked into the "noise" executable.
# All intermediate files go into objs/.

CXX=g++ -m64
CXXFLAGS=-Iobjs/ -O3 -Wall
ISPC=ispc
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64

default: noise

.PHONY: default dirs clean

dirs:
	/bin/mkdir -p objs/

clean:
	/bin/rm -rf objs *~ noise

# "dirs" is phony, so listing it as a normal prerequisite would force a
# relink on every invocation.  Instead the rules that write into objs/ take
# it as an order-only prerequisite (after the "|"): the directory is created
# before they run, also under "make -j", without making targets out of date.
noise: objs/noise.o objs/noise_serial.o objs/noise_ispc.o
	$(CXX) $(CXXFLAGS) -o $@ objs/noise.o objs/noise_ispc.o objs/noise_serial.o -lm

objs/%.o: %.cpp | dirs
	$(CXX) $< $(CXXFLAGS) -c -o $@

# noise.cpp includes the header generated by ispc.
objs/noise.o: objs/noise_ispc.h

objs/%_ispc.h objs/%_ispc.o: %.ispc | dirs
	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
150
examples/noise/noise.cpp
Normal file
150
examples/noise/noise.cpp
Normal file
@@ -0,0 +1,150 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "noise_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
extern void noise_serial(float x0, float y0, float x1, float y1,
|
||||
int width, int height, float output[]);
|
||||
|
||||
/* Write a grayscale float image to a binary PPM (P6) file.

   buf holds width*height values.  Each value is scaled by 255, clamped to
   [0, 255] and written three times, once per colour channel.  If the file
   cannot be opened, an error is reported on stderr and nothing is written. */
static void
writePPM(float *buf, int width, int height, const char *fn) {
    FILE *fp = fopen(fn, "wb");
    if (!fp) {
        // Without this check a failed open would be passed straight to
        // fprintf() below and crash the program.
        perror(fn);
        return;
    }
    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", width, height);
    fprintf(fp, "255\n");
    for (int i = 0; i < width*height; ++i) {
        float v = buf[i] * 255.f;
        // Written as a negated test so that NaN is clamped to 0 as well.
        if (!(v > 0.f)) v = 0.f;
        if (v > 255.f) v = 255.f;
        // Convert through unsigned char: values above 127 do not fit in a
        // signed char, and an out-of-range float conversion is undefined.
        const unsigned char c = (unsigned char)v;
        for (int j = 0; j < 3; ++j)
            fputc(c, fp);
    }
    fclose(fp);
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
unsigned int width = 768;
|
||||
unsigned int height = 768;
|
||||
float x0 = -10;
|
||||
float x1 = 10;
|
||||
float y0 = -10;
|
||||
float y1 = 10;
|
||||
|
||||
float *buf = new float[width*height];
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation; report the minimum
|
||||
// time of three runs.
|
||||
//
|
||||
double minISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
noise_ispc(x0, y0, x1, y1, width, height, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minISPC = std::min(minISPC, dt);
|
||||
}
|
||||
|
||||
printf("[noise ispc]:\t\t\t[%.3f] million cycles\n", minISPC);
|
||||
writePPM(buf, width, height, "noise-ispc.ppm");
|
||||
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
// minimum time.
|
||||
//
|
||||
double minSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
noise_serial(x0, y0, x1, y1, width, height, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minSerial = std::min(minSerial, dt);
|
||||
}
|
||||
|
||||
printf("[noise serial]:\t\t\t[%.3f] millon cycles\n", minSerial);
|
||||
writePPM(buf, width, height, "noise-serial.ppm");
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);
|
||||
|
||||
return 0;
|
||||
}
|
||||
164
examples/noise/noise.ispc
Normal file
164
examples/noise/noise.ispc
Normal file
@@ -0,0 +1,164 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#define NOISE_PERM_SIZE 256
|
||||
|
||||
static uniform int NoisePerm[2 * NOISE_PERM_SIZE] = {
|
||||
151, 160, 137, 91, 90, 15, 131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140,
|
||||
36, 103, 30, 69, 142, 8, 99, 37, 240, 21, 10, 23, 190, 6, 148, 247, 120,
|
||||
234, 75, 0, 26, 197, 62, 94, 252, 219, 203, 117, 35, 11, 32, 57, 177, 33,
|
||||
88, 237, 149, 56, 87, 174, 20, 125, 136, 171, 168, 68, 175, 74, 165, 71,
|
||||
134, 139, 48, 27, 166, 77, 146, 158, 231, 83, 111, 229, 122, 60, 211, 133,
|
||||
230, 220, 105, 92, 41, 55, 46, 245, 40, 244, 102, 143, 54, 65, 25, 63, 161,
|
||||
1, 216, 80, 73, 209, 76, 132, 187, 208, 89, 18, 169, 200, 196, 135, 130,
|
||||
116, 188, 159, 86, 164, 100, 109, 198, 173, 186, 3, 64, 52, 217, 226, 250,
|
||||
124, 123, 5, 202, 38, 147, 118, 126, 255, 82, 85, 212, 207, 206, 59, 227,
|
||||
47, 16, 58, 17, 182, 189, 28, 42, 223, 183, 170, 213, 119, 248, 152, 2, 44,
|
||||
154, 163, 70, 221, 153, 101, 155, 167, 43, 172, 9, 129, 22, 39, 253, 19,
|
||||
98, 108, 110, 79, 113, 224, 232, 178, 185, 112, 104, 218, 246, 97, 228, 251,
|
||||
34, 242, 193, 238, 210, 144, 12, 191, 179, 162, 241, 81, 51, 145, 235, 249,
|
||||
14, 239, 107, 49, 192, 214, 31, 181, 199, 106, 157, 184, 84, 204, 176, 115,
|
||||
121, 50, 45, 127, 4, 150, 254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72,
|
||||
243, 141, 128, 195, 78, 66, 215, 61, 156, 180, 151, 160, 137, 91, 90, 15,
|
||||
131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140, 36, 103, 30, 69, 142, 8, 99,
|
||||
37, 240, 21, 10, 23, 190, 6, 148, 247, 120, 234, 75, 0, 26, 197, 62, 94, 252,
|
||||
219, 203, 117, 35, 11, 32, 57, 177, 33, 88, 237, 149, 56, 87, 174, 20, 125,
|
||||
136, 171, 168, 68, 175, 74, 165, 71, 134, 139, 48, 27, 166, 77, 146, 158,
|
||||
231, 83, 111, 229, 122, 60, 211, 133, 230, 220, 105, 92, 41, 55, 46, 245,
|
||||
40, 244, 102, 143, 54, 65, 25, 63, 161, 1, 216, 80, 73, 209, 76, 132, 187,
|
||||
208, 89, 18, 169, 200, 196, 135, 130, 116, 188, 159, 86, 164, 100, 109,
|
||||
198, 173, 186, 3, 64, 52, 217, 226, 250, 124, 123, 5, 202, 38, 147, 118,
|
||||
126, 255, 82, 85, 212, 207, 206, 59, 227, 47, 16, 58, 17, 182, 189, 28, 42,
|
||||
223, 183, 170, 213, 119, 248, 152, 2, 44, 154, 163, 70, 221, 153, 101, 155,
|
||||
167, 43, 172, 9, 129, 22, 39, 253, 19, 98, 108, 110, 79, 113, 224, 232,
|
||||
178, 185, 112, 104, 218, 246, 97, 228, 251, 34, 242, 193, 238, 210, 144,
|
||||
12, 191, 179, 162, 241, 81, 51, 145, 235, 249, 14, 239, 107, 49, 192, 214,
|
||||
31, 181, 199, 106, 157, 184, 84, 204, 176, 115, 121, 50, 45, 127, 4, 150,
|
||||
254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72, 243, 141, 128, 195, 78,
|
||||
66, 215, 61, 156, 180
|
||||
};
|
||||
|
||||
|
||||
// Cubic smooth step: maps value from [low, high] to [0, 1], clamps it, and
// returns t*t*(3 - 2t) of the clamped parameter.
inline float SmoothStep(float low, float high, float value) {
    float t = (value - low) / (high - low);
    t = clamp(t, 0.f, 1.f);
    return t * t * (-2.f * t + 3.f);
}
|
||||
|
||||
|
||||
// Round val down to the nearest integer and return it as an int.
inline int Floor2Int(float val) {
    float floored = floor(val);
    return (int)floored;
}
|
||||
|
||||
|
||||
// Gradient contribution of lattice corner (x, y, z) for the offset
// (dx, dy, dz).  The corner is hashed through three chained NoisePerm
// lookups; the low four bits of the hash choose which two offset components
// are summed and with which signs.
inline float Grad(int x, int y, int z, float dx, float dy, float dz) {
    int hash = NoisePerm[NoisePerm[NoisePerm[x]+y]+z] & 15;
    float u = (hash<8 || hash==12 || hash==13) ? dx : dy;
    float v = (hash<4 || hash==12 || hash==13) ? dy : dz;
    // Bit 0 flips the sign of u, bit 1 flips the sign of v.
    float signedU = (hash&1) ? -u : u;
    float signedV = (hash&2) ? -v : v;
    return signedU + signedV;
}
|
||||
|
||||
|
||||
// Quintic interpolation weight 6t^5 - 15t^4 + 10t^3.
inline float NoiseWeight(float t) {
    float cube = t*t*t;
    float quart = cube*t;
    return 6.f*quart*t - 15.f*quart + 10.f*cube;
}
|
||||
|
||||
|
||||
// Linear interpolation: returns low at t == 0 and high at t == 1.
inline float Lerp(float t, float low, float high) {
    float lowWeight = 1. - t;
    return lowWeight * low + t * high;
}
|
||||
|
||||
|
||||
// Gradient noise at (x, y, z): evaluates Grad() at the eight corners of the
// lattice cell containing the point and blends the results trilinearly with
// NoiseWeight()-smoothed weights.
static float Noise(float x, float y, float z) {
    // Lattice cell containing the point, and the offset inside that cell.
    int cellX = Floor2Int(x);
    int cellY = Floor2Int(y);
    int cellZ = Floor2Int(z);
    float fx = x - cellX;
    float fy = y - cellY;
    float fz = z - cellZ;

    // Wrap the cell coordinates into [0, NOISE_PERM_SIZE).
    cellX &= (NOISE_PERM_SIZE-1);
    cellY &= (NOISE_PERM_SIZE-1);
    cellZ &= (NOISE_PERM_SIZE-1);

    // Coordinates of, and offsets relative to, the far corner on each axis.
    int farX = cellX+1, farY = cellY+1, farZ = cellZ+1;
    float gx = fx-1, gy = fy-1, gz = fz-1;

    // Corner contributions, named w<x><y><z> with 1 meaning the far side.
    float w000 = Grad(cellX, cellY, cellZ, fx, fy, fz);
    float w100 = Grad(farX,  cellY, cellZ, gx, fy, fz);
    float w010 = Grad(cellX, farY,  cellZ, fx, gy, fz);
    float w110 = Grad(farX,  farY,  cellZ, gx, gy, fz);
    float w001 = Grad(cellX, cellY, farZ,  fx, fy, gz);
    float w101 = Grad(farX,  cellY, farZ,  gx, fy, gz);
    float w011 = Grad(cellX, farY,  farZ,  fx, gy, gz);
    float w111 = Grad(farX,  farY,  farZ,  gx, gy, gz);

    // Blend along x, then y, then z.
    float wx = NoiseWeight(fx);
    float wy = NoiseWeight(fy);
    float wz = NoiseWeight(fz);
    float nearFace = Lerp(wy, Lerp(wx, w000, w100), Lerp(wx, w010, w110));
    float farFace  = Lerp(wy, Lerp(wx, w001, w101), Lerp(wx, w011, w111));
    return Lerp(wz, nearFace, farFace);
}
|
||||
|
||||
|
||||
// Sum of |Noise| over `octaves` octaves.  Each octave multiplies the
// frequency by 1.99 and the amplitude by 0.6; the total is halved.
static float Turbulence(float x, float y, float z, uniform int octaves) {
    float gain = 0.6;

    float total = 0.;
    float freq = 1.;
    float amp = 1.;
    for (uniform int octave = 0; octave < octaves; ++octave) {
        float n = Noise(freq * x, freq * y, freq * z);
        total += abs(amp * n);
        freq *= 1.99f;
        amp *= gain;
    }
    return total * 0.5;
}
|
||||
|
||||
|
||||
// Fill output (width*height floats, row-major) with 8-octave turbulence
// sampled over the rectangle [x0,x1] x [y0,y1] at z = 0.6.  Each inner
// iteration covers programCount consecutive pixels of one row, one pixel per
// program instance.
export void noise_ispc(uniform float x0, uniform float y0, uniform float x1,
                       uniform float y1, uniform int width, uniform int height,
                       uniform float output[])
{
    uniform float dx = (x1 - x0) / width;
    uniform float dy = (y1 - y0) / height;

    for (uniform int j = 0; j < height; j++) {
        for (uniform int i = 0; i < width; i += programCount) {
            int col = i + programIndex;
            // When width is not a multiple of programCount, the last group
            // of a row has lanes past the row end.  Mask them off so they
            // do not write into the next row or beyond the buffer.
            if (col < width) {
                float x = x0 + col * dx;
                float y = y0 + j * dy;

                int index = (j * width + col);
                output[index] = Turbulence(x, y, 0.6, 8);
            }
        }
    }
}
|
||||
|
||||
167
examples/noise/noise.vcxproj
Normal file
167
examples/noise/noise.vcxproj
Normal file
@@ -0,0 +1,167 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>noise</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="noise.cpp" />
|
||||
<ClCompile Include="noise_serial.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="noise.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
170
examples/noise/noise_serial.cpp
Normal file
170
examples/noise/noise_serial.cpp
Normal file
@@ -0,0 +1,170 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#define NOISE_PERM_SIZE 256
|
||||
|
||||
static int NoisePerm[2 * NOISE_PERM_SIZE] = {
|
||||
151, 160, 137, 91, 90, 15, 131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140,
|
||||
36, 103, 30, 69, 142, 8, 99, 37, 240, 21, 10, 23, 190, 6, 148, 247, 120,
|
||||
234, 75, 0, 26, 197, 62, 94, 252, 219, 203, 117, 35, 11, 32, 57, 177, 33,
|
||||
88, 237, 149, 56, 87, 174, 20, 125, 136, 171, 168, 68, 175, 74, 165, 71,
|
||||
134, 139, 48, 27, 166, 77, 146, 158, 231, 83, 111, 229, 122, 60, 211, 133,
|
||||
230, 220, 105, 92, 41, 55, 46, 245, 40, 244, 102, 143, 54, 65, 25, 63, 161,
|
||||
1, 216, 80, 73, 209, 76, 132, 187, 208, 89, 18, 169, 200, 196, 135, 130,
|
||||
116, 188, 159, 86, 164, 100, 109, 198, 173, 186, 3, 64, 52, 217, 226, 250,
|
||||
124, 123, 5, 202, 38, 147, 118, 126, 255, 82, 85, 212, 207, 206, 59, 227,
|
||||
47, 16, 58, 17, 182, 189, 28, 42, 223, 183, 170, 213, 119, 248, 152, 2, 44,
|
||||
154, 163, 70, 221, 153, 101, 155, 167, 43, 172, 9, 129, 22, 39, 253, 19,
|
||||
98, 108, 110, 79, 113, 224, 232, 178, 185, 112, 104, 218, 246, 97, 228, 251,
|
||||
34, 242, 193, 238, 210, 144, 12, 191, 179, 162, 241, 81, 51, 145, 235, 249,
|
||||
14, 239, 107, 49, 192, 214, 31, 181, 199, 106, 157, 184, 84, 204, 176, 115,
|
||||
121, 50, 45, 127, 4, 150, 254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72,
|
||||
243, 141, 128, 195, 78, 66, 215, 61, 156, 180, 151, 160, 137, 91, 90, 15,
|
||||
131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140, 36, 103, 30, 69, 142, 8, 99,
|
||||
37, 240, 21, 10, 23, 190, 6, 148, 247, 120, 234, 75, 0, 26, 197, 62, 94, 252,
|
||||
219, 203, 117, 35, 11, 32, 57, 177, 33, 88, 237, 149, 56, 87, 174, 20, 125,
|
||||
136, 171, 168, 68, 175, 74, 165, 71, 134, 139, 48, 27, 166, 77, 146, 158,
|
||||
231, 83, 111, 229, 122, 60, 211, 133, 230, 220, 105, 92, 41, 55, 46, 245,
|
||||
40, 244, 102, 143, 54, 65, 25, 63, 161, 1, 216, 80, 73, 209, 76, 132, 187,
|
||||
208, 89, 18, 169, 200, 196, 135, 130, 116, 188, 159, 86, 164, 100, 109,
|
||||
198, 173, 186, 3, 64, 52, 217, 226, 250, 124, 123, 5, 202, 38, 147, 118,
|
||||
126, 255, 82, 85, 212, 207, 206, 59, 227, 47, 16, 58, 17, 182, 189, 28, 42,
|
||||
223, 183, 170, 213, 119, 248, 152, 2, 44, 154, 163, 70, 221, 153, 101, 155,
|
||||
167, 43, 172, 9, 129, 22, 39, 253, 19, 98, 108, 110, 79, 113, 224, 232,
|
||||
178, 185, 112, 104, 218, 246, 97, 228, 251, 34, 242, 193, 238, 210, 144,
|
||||
12, 191, 179, 162, 241, 81, 51, 145, 235, 249, 14, 239, 107, 49, 192, 214,
|
||||
31, 181, 199, 106, 157, 184, 84, 204, 176, 115, 121, 50, 45, 127, 4, 150,
|
||||
254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72, 243, 141, 128, 195, 78,
|
||||
66, 215, 61, 156, 180
|
||||
};
|
||||
|
||||
|
||||
// Restrict v to the interval [low, high].
inline float Clamp(float v, float low, float high) {
    if (v < low)
        return low;
    if (v > high)
        return high;
    return v;
}
|
||||
|
||||
|
||||
inline float SmoothStep(float low, float high, float value) {
|
||||
float v = Clamp((value - low) / (high - low), 0.f, 1.f);
|
||||
return v * v * (-2.f * v + 3.f);
|
||||
}
|
||||
|
||||
|
||||
// Round val down to the nearest integer and return it as an int.
inline int Floor2Int(float val) {
    const float floored = floorf(val);
    return (int)floored;
}
|
||||
|
||||
|
||||
inline float Grad(int x, int y, int z, float dx, float dy, float dz) {
|
||||
int h = NoisePerm[NoisePerm[NoisePerm[x]+y]+z];
|
||||
h &= 15;
|
||||
float u = h<8 || h==12 || h==13 ? dx : dy;
|
||||
float v = h<4 || h==12 || h==13 ? dy : dz;
|
||||
return ((h&1) ? -u : u) + ((h&2) ? -v : v);
|
||||
}
|
||||
|
||||
|
||||
// Quintic interpolation weight 6t^5 - 15t^4 + 10t^3.
inline float NoiseWeight(float t) {
    const float cube = t*t*t;
    const float quart = cube*t;
    return 6.f*quart*t - 15.f*quart + 10.f*cube;
}
|
||||
|
||||
|
||||
// Linear interpolation: returns low at t == 0 and high at t == 1.
inline float Lerp(float t, float low, float high) {
    const float lowWeight = 1.f - t;
    return lowWeight * low + t * high;
}
|
||||
|
||||
|
||||
static float Noise(float x, float y, float z) {
|
||||
// Compute noise cell coordinates and offsets
|
||||
int ix = Floor2Int(x), iy = Floor2Int(y), iz = Floor2Int(z);
|
||||
float dx = x - ix, dy = y - iy, dz = z - iz;
|
||||
|
||||
// Compute gradient weights
|
||||
ix &= (NOISE_PERM_SIZE-1);
|
||||
iy &= (NOISE_PERM_SIZE-1);
|
||||
iz &= (NOISE_PERM_SIZE-1);
|
||||
float w000 = Grad(ix, iy, iz, dx, dy, dz);
|
||||
float w100 = Grad(ix+1, iy, iz, dx-1, dy, dz);
|
||||
float w010 = Grad(ix, iy+1, iz, dx, dy-1, dz);
|
||||
float w110 = Grad(ix+1, iy+1, iz, dx-1, dy-1, dz);
|
||||
float w001 = Grad(ix, iy, iz+1, dx, dy, dz-1);
|
||||
float w101 = Grad(ix+1, iy, iz+1, dx-1, dy, dz-1);
|
||||
float w011 = Grad(ix, iy+1, iz+1, dx, dy-1, dz-1);
|
||||
float w111 = Grad(ix+1, iy+1, iz+1, dx-1, dy-1, dz-1);
|
||||
|
||||
// Compute trilinear interpolation of weights
|
||||
float wx = NoiseWeight(dx), wy = NoiseWeight(dy), wz = NoiseWeight(dz);
|
||||
float x00 = Lerp(wx, w000, w100);
|
||||
float x10 = Lerp(wx, w010, w110);
|
||||
float x01 = Lerp(wx, w001, w101);
|
||||
float x11 = Lerp(wx, w011, w111);
|
||||
float y0 = Lerp(wy, x00, x10);
|
||||
float y1 = Lerp(wy, x01, x11);
|
||||
return Lerp(wz, y0, y1);
|
||||
}
|
||||
|
||||
|
||||
static float Turbulence(float x, float y, float z, int octaves) {
|
||||
float omega = 0.6;
|
||||
|
||||
float sum = 0., lambda = 1., o = 1.;
|
||||
for (int i = 0; i < octaves; ++i) {
|
||||
sum += fabsf(o * Noise(lambda * x, lambda * y, lambda * z));
|
||||
lambda *= 1.99f;
|
||||
o *= omega;
|
||||
}
|
||||
return sum * 0.5f;
|
||||
}
|
||||
|
||||
|
||||
void noise_serial(float x0, float y0, float x1, float y1,
|
||||
int width, int height, float output[])
|
||||
{
|
||||
float dx = (x1 - x0) / width;
|
||||
float dy = (y1 - y0) / height;
|
||||
|
||||
for (int j = 0; j < height; j++) {
|
||||
for (int i = 0; i < width; ++i) {
|
||||
float x = x0 + i * dx;
|
||||
float y = y0 + j * dy;
|
||||
|
||||
int index = (j * width + i);
|
||||
output[index] = Turbulence(x, y, 0.6f, 8);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
0
examples/options/options.vcxproj
Executable file → Normal file
0
examples/options/options.vcxproj
Executable file → Normal file
@@ -47,7 +47,7 @@ static inline float
|
||||
CND(float X) {
|
||||
float L = fabsf(X);
|
||||
|
||||
float k = 1.0 / (1.0 + 0.2316419 * L);
|
||||
float k = 1.f / (1.f + 0.2316419f * L);
|
||||
float k2 = k*k;
|
||||
float k3 = k2*k;
|
||||
float k4 = k2*k2;
|
||||
@@ -59,7 +59,7 @@ CND(float X) {
|
||||
w *= invSqrt2Pi * expf(-L * L * .5f);
|
||||
|
||||
if (X > 0.f)
|
||||
w = 1.0 - w;
|
||||
w = 1.f - w;
|
||||
return w;
|
||||
}
|
||||
|
||||
@@ -94,7 +94,7 @@ binomial_put_serial(float Sa[], float Xa[], float Ta[],
|
||||
|
||||
float dt = T / BINOMIAL_NUM;
|
||||
float u = expf(v * sqrtf(dt));
|
||||
float d = 1. / u;
|
||||
float d = 1.f / u;
|
||||
float disc = expf(r * dt);
|
||||
float Pu = (disc - d) / (u - d);
|
||||
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
|
||||
@@ -14,11 +20,16 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ rt
|
||||
|
||||
rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o -lm
|
||||
rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp objs/rt_ispc.h
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/rt.o: objs/rt_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -42,6 +42,7 @@
|
||||
#include <math.h>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
@@ -51,7 +52,8 @@ using namespace ispc;
|
||||
|
||||
typedef unsigned int uint;
|
||||
|
||||
extern void raytrace_serial(int width, int height, const float raster2camera[4][4],
|
||||
extern void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4], float image[],
|
||||
int id[], const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]);
|
||||
@@ -90,6 +92,7 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
printf("Wrote image file %s\n", filename);
|
||||
}
|
||||
|
||||
|
||||
@@ -125,11 +128,28 @@ ensureTargetISAIsSupported() {
|
||||
}
|
||||
|
||||
|
||||
// Writes the command-line synopsis to stderr and terminates the process
// with exit status 1.  Never returns.
static void usage() {
    fputs("rt [--scale=<factor>] <scene name base>\n", stderr);
    exit(1);
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 2) {
|
||||
fprintf(stderr, "usage: rt <filename base>\n");
|
||||
exit(1);
|
||||
float scale = 1.f;
|
||||
const char *filename = NULL;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (strncmp(argv[i], "--scale=", 8) == 0) {
|
||||
scale = atof(argv[i] + 8);
|
||||
if (scale == 0.f)
|
||||
usage();
|
||||
}
|
||||
else if (filename != NULL)
|
||||
usage();
|
||||
else
|
||||
filename = argv[i];
|
||||
}
|
||||
if (filename == NULL)
|
||||
usage();
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
@@ -143,10 +163,10 @@ int main(int argc, char *argv[]) {
|
||||
// Read the camera specification information from the camera file
|
||||
//
|
||||
char fnbuf[1024];
|
||||
sprintf(fnbuf, "%s.camera", argv[1]);
|
||||
sprintf(fnbuf, "%s.camera", filename);
|
||||
FILE *f = fopen(fnbuf, "rb");
|
||||
if (!f) {
|
||||
perror(argv[1]);
|
||||
perror(fnbuf);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -154,20 +174,20 @@ int main(int argc, char *argv[]) {
|
||||
// Nothing fancy, and trouble if we run on a big-endian system, just
|
||||
// fread in the bits
|
||||
//
|
||||
int width, height;
|
||||
int baseWidth, baseHeight;
|
||||
float camera2world[4][4], raster2camera[4][4];
|
||||
READ(width, 1);
|
||||
READ(height, 1);
|
||||
READ(baseWidth, 1);
|
||||
READ(baseHeight, 1);
|
||||
READ(camera2world[0][0], 16);
|
||||
READ(raster2camera[0][0], 16);
|
||||
|
||||
//
|
||||
// Read in the serialized BVH
|
||||
//
|
||||
sprintf(fnbuf, "%s.bvh", argv[1]);
|
||||
sprintf(fnbuf, "%s.bvh", filename);
|
||||
f = fopen(fnbuf, "rb");
|
||||
if (!f) {
|
||||
perror(argv[2]);
|
||||
perror(fnbuf);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -190,7 +210,9 @@ int main(int argc, char *argv[]) {
|
||||
nodes[i].bounds[1].v[1] = b[4];
|
||||
nodes[i].bounds[1].v[2] = b[5];
|
||||
READ(nodes[i].offset, 1);
|
||||
READ(nodes[i].primsAxis, 1);
|
||||
READ(nodes[i].nPrimitives, 1);
|
||||
READ(nodes[i].splitAxis, 1);
|
||||
READ(nodes[i].pad, 1);
|
||||
}
|
||||
|
||||
// And then read the triangles
|
||||
@@ -212,10 +234,10 @@ int main(int argc, char *argv[]) {
|
||||
}
|
||||
fclose(f);
|
||||
|
||||
// round image resolution up to multiple of 4 to makethings easy for
|
||||
// round image resolution up to multiple of 16 to make things easy for
|
||||
// the code that assigns pixels to ispc program instances
|
||||
height = (height + 3) & ~3;
|
||||
width = (width + 3) & ~3;
|
||||
int height = (int(baseHeight * scale) + 0xf) & ~0xf;
|
||||
int width = (int(baseWidth * scale) + 0xf) & ~0xf;
|
||||
|
||||
// allocate images; one to hold hit object ids, one to hold depth to
|
||||
// the first intersection
|
||||
@@ -223,19 +245,42 @@ int main(int argc, char *argv[]) {
|
||||
float *image = new float[width*height];
|
||||
|
||||
//
|
||||
// Run 3 iterations with ispc, record the minimum time
|
||||
// Run 3 iterations with ispc + 1 core, record the minimum time
|
||||
//
|
||||
double minTimeISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
raytrace(width, height, raster2camera, camera2world,
|
||||
image, id, nodes, triangles);
|
||||
raytrace_ispc(width, height, baseWidth, baseHeight, raster2camera,
|
||||
camera2world, image, id, nodes, triangles);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPC = std::min(dt, minTimeISPC);
|
||||
}
|
||||
printf("[rt ispc]:\t\t\t[%.3f] million cycles for %d x %d image\n", minTimeISPC, width, height);
|
||||
printf("[rt ispc, 1 core]:\t\t[%.3f] million cycles for %d x %d image\n",
|
||||
minTimeISPC, width, height);
|
||||
|
||||
writeImage(id, image, width, height, "rt-ispc.ppm");
|
||||
writeImage(id, image, width, height, "rt-ispc-1core.ppm");
|
||||
|
||||
memset(id, 0, width*height*sizeof(int));
|
||||
memset(image, 0, width*height*sizeof(float));
|
||||
|
||||
//
|
||||
// Run 3 iterations with ispc + tasks, record the minimum time
|
||||
//
|
||||
double minTimeISPCtasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera,
|
||||
camera2world, image, id, nodes, triangles);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
|
||||
}
|
||||
printf("[rt ispc + tasks]:\t\t[%.3f] million cycles for %d x %d image\n",
|
||||
minTimeISPCtasks, width, height);
|
||||
|
||||
writeImage(id, image, width, height, "rt-ispc-tasks.ppm");
|
||||
|
||||
memset(id, 0, width*height*sizeof(int));
|
||||
memset(image, 0, width*height*sizeof(float));
|
||||
|
||||
//
|
||||
// And 3 iterations with the serial implementation, reporting the
|
||||
@@ -244,14 +289,15 @@ int main(int argc, char *argv[]) {
|
||||
double minTimeSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
raytrace_serial(width, height, raster2camera, camera2world,
|
||||
image, id, nodes, triangles);
|
||||
raytrace_serial(width, height, baseWidth, baseHeight, raster2camera,
|
||||
camera2world, image, id, nodes, triangles);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeSerial = std::min(dt, minTimeSerial);
|
||||
}
|
||||
printf("[rt serial]:\t\t\t[%.3f] million cycles for %d x %d image\n",
|
||||
minTimeSerial, width, height);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCtasks);
|
||||
|
||||
writeImage(id, image, width, height, "rt-serial.ppm");
|
||||
|
||||
|
||||
@@ -50,21 +50,11 @@ struct Triangle {
|
||||
struct LinearBVHNode {
|
||||
uniform float3 bounds[2];
|
||||
uniform unsigned int offset; // num primitives for leaf, second child for interior
|
||||
uniform unsigned int primsAxis; // 0:7 nPrimitives, 8:15 split axis, 16:31 padding
|
||||
uniform unsigned int8 nPrimitives;
|
||||
uniform unsigned int8 splitAxis;
|
||||
uniform unsigned int16 pad;
|
||||
};
|
||||
|
||||
static inline uniform int nPrims(const reference LinearBVHNode node) {
|
||||
return (node.primsAxis & 0xff);
|
||||
}
|
||||
|
||||
static inline uniform int axis(const reference LinearBVHNode node) {
|
||||
return ((node.primsAxis >> 8) & 0xff);
|
||||
}
|
||||
|
||||
static inline uniform bool isInterior(const reference LinearBVHNode node) {
|
||||
return nPrims(node) == 0;
|
||||
}
|
||||
|
||||
static inline float3 Cross(const float3 v1, const float3 v2) {
|
||||
float v1x = v1.x, v1y = v1.y, v1z = v1.z;
|
||||
float v2x = v2.x, v2y = v2.y, v2z = v2.z;
|
||||
@@ -199,7 +189,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||
// Check ray against BVH node
|
||||
LinearBVHNode node = nodes[nodeNum];
|
||||
if (any(BBoxIntersect(node.bounds, ray))) {
|
||||
uniform unsigned int nPrimitives = nPrims(node);
|
||||
uniform unsigned int nPrimitives = node.nPrimitives;
|
||||
if (nPrimitives > 0) {
|
||||
// Intersect ray with primitives in leaf BVH node
|
||||
uniform unsigned int primitivesOffset = node.offset;
|
||||
@@ -213,7 +203,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||
}
|
||||
else {
|
||||
// Put far BVH node on _todo_ stack, advance to near node
|
||||
if (r.dirIsNeg[axis(node)]) {
|
||||
if (r.dirIsNeg[node.splitAxis]) {
|
||||
todo[todoOffset++] = nodeNum + 1;
|
||||
nodeNum = node.offset;
|
||||
}
|
||||
@@ -236,20 +226,26 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||
}
|
||||
|
||||
|
||||
export void raytrace(uniform int width, uniform int height,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
static void raytrace_tile(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
uniform float widthScale = (float)(baseWidth) / (float)(width);
|
||||
uniform float heightScale = (float)(baseHeight) / (float)(height);
|
||||
|
||||
static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
|
||||
0, 1, 0, 1, 2, 3, 2, 3 };
|
||||
static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
|
||||
2, 2, 3, 3, 2, 2, 3, 3 };
|
||||
|
||||
// The outer loops are always over blocks of 4x4 pixels
|
||||
for (uniform int y = 0; y < height; y += 4) {
|
||||
for (uniform int x = 0; x < width; x += 4) {
|
||||
for (uniform int y = y0; y < y1; y += 4) {
|
||||
for (uniform int x = x0; x < x1; x += 4) {
|
||||
// Now we have a block of 4x4=16 pixels to process; it will
|
||||
// take 16/programCount iterations of this loop to process
|
||||
// them.
|
||||
@@ -261,7 +257,8 @@ export void raytrace(uniform int width, uniform int height,
|
||||
const float dy = udy[o * programCount + programIndex];
|
||||
|
||||
Ray ray;
|
||||
generateRay(raster2camera, camera2world, x+dx, y+dy, ray);
|
||||
generateRay(raster2camera, camera2world, (x+dx)*widthScale,
|
||||
(y+dy)*heightScale, ray);
|
||||
BVHIntersect(nodes, triangles, ray);
|
||||
|
||||
int offset = (y + (int)dy) * width + (x + (int)dx);
|
||||
@@ -271,3 +268,54 @@ export void raytrace(uniform int width, uniform int height,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Exported entry point that renders the whole image in one call: it hands
// the full pixel range [0, width) x [0, height) to raytrace_tile() and
// forwards every other argument unchanged.
export void raytrace_ispc(uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
// x range = [0, width), y range = [0, height): a single tile covering the image
raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world, image,
|
||||
id, nodes, triangles);
|
||||
}
|
||||
|
||||
|
||||
// Task body for the task-parallel renderer.  The caller supplies the row
// range [y0, y1); the column range is derived here from taskIndex, with
// each task covering a strip of dx = 16 pixels (clamped to the image
// width).  The resulting tile is rendered by raytrace_tile().
task void raytrace_tile_task(uniform int y0, uniform int y1,
|
||||
uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
uniform int dx = 16; // must match dx below
|
||||
// Number of dx-wide strips needed to cover the width (rounded up)
uniform int xTasks = (width + (dx-1)) / dx;
|
||||
// NOTE(review): raytrace_ispc_tasks launches exactly xTasks tasks per row
// band, so the modulo looks like a no-op for that caller -- confirm.
uniform int x0 = (taskIndex % xTasks) * dx;
|
||||
uniform int x1 = x0 + dx;
|
||||
// Do not run past the right edge of the image
x1 = min(x1, width);
|
||||
|
||||
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world, image,
|
||||
id, nodes, triangles);
|
||||
}
|
||||
|
||||
|
||||
// Exported entry point that renders the image with tasks.  The image is
// walked in bands of dy = 16 rows; for every band, nTasks instances of
// raytrace_tile_task are launched, one per dx = 16 pixel wide strip.
// NOTE(review): there is no explicit sync in this function; presumably the
// launched tasks are waited for when the function returns -- confirm
// against the ispc documentation for this compiler version.
export void raytrace_ispc_tasks(uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
// dx has to stay equal to the dx used inside raytrace_tile_task
uniform int dx = 16, dy = 16;
|
||||
// Strips per band, rounded up so a partial last strip is still covered
uniform int nTasks = (width + (dx-1)) / dx;
|
||||
for (uniform int y = 0; y < height; y += dy) {
|
||||
// Last band may be shorter than dy rows
uniform int y1 = min(y + dy, height);
|
||||
launch[nTasks] < raytrace_tile_task(y, y1, width, height, baseWidth,
|
||||
baseHeight, raster2camera, camera2world,
|
||||
image, id, nodes, triangles) >;
|
||||
}
|
||||
}
|
||||
|
||||
3
examples/rt/rt.vcxproj
Executable file → Normal file
3
examples/rt/rt.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -164,6 +164,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<ItemGroup>
|
||||
<ClCompile Include="rt.cpp" />
|
||||
<ClCompile Include="rt_serial.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
|
||||
@@ -39,6 +39,7 @@
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <stdint.h>
|
||||
|
||||
// Just enough of a float3 class to do what we need in this file.
|
||||
#ifdef _MSC_VER
|
||||
@@ -75,30 +76,20 @@ struct Ray {
|
||||
namespace ispc {
|
||||
struct Triangle {
|
||||
float3 p[3];
|
||||
int id;
|
||||
int32_t id;
|
||||
};
|
||||
|
||||
struct LinearBVHNode {
|
||||
float3 bounds[2];
|
||||
unsigned int offset; // primitives for leaf, second child for interior
|
||||
unsigned int primsAxis; // 0:7 nPrimitives, 8:15 split axis, 16:31 padding
|
||||
int32_t offset; // primitives for leaf, second child for interior
|
||||
uint8_t nPrimitives;
|
||||
uint8_t splitAxis;
|
||||
uint16_t pad;
|
||||
};
|
||||
}
|
||||
|
||||
using namespace ispc;
|
||||
|
||||
inline int nPrims(const LinearBVHNode &node) {
|
||||
return (node.primsAxis & 0xff);
|
||||
}
|
||||
|
||||
inline int axis(const LinearBVHNode &node) {
|
||||
return ((node.primsAxis >> 8) & 0xff);
|
||||
}
|
||||
|
||||
inline bool isInterior(const LinearBVHNode &node) {
|
||||
return nPrims(node) == 0;
|
||||
}
|
||||
|
||||
inline float3 Cross(const float3 &v1, const float3 &v2) {
|
||||
float v1x = v1.x, v1y = v1.y, v1z = v1.z;
|
||||
float v2x = v2.x, v2y = v2.y, v2z = v2.z;
|
||||
@@ -230,7 +221,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||
// Check ray against BVH node
|
||||
const LinearBVHNode &node = nodes[nodeNum];
|
||||
if (BBoxIntersect(node.bounds, ray)) {
|
||||
unsigned int nPrimitives = nPrims(node);
|
||||
unsigned int nPrimitives = node.nPrimitives;
|
||||
if (nPrimitives > 0) {
|
||||
// Intersect ray with primitives in leaf BVH node
|
||||
unsigned int primitivesOffset = node.offset;
|
||||
@@ -244,7 +235,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||
}
|
||||
else {
|
||||
// Put far BVH node on _todo_ stack, advance to near node
|
||||
if (r.dirIsNeg[axis(node)]) {
|
||||
if (r.dirIsNeg[node.splitAxis]) {
|
||||
todo[todoOffset++] = nodeNum + 1;
|
||||
nodeNum = node.offset;
|
||||
}
|
||||
@@ -267,17 +258,21 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||
}
|
||||
|
||||
|
||||
void raytrace_serial(int width, int height,
|
||||
void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
float image[],
|
||||
int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
float widthScale = float(baseWidth) / float(width);
|
||||
float heightScale = float(baseHeight) / float(height);
|
||||
|
||||
for (int y = 0; y < height; ++y) {
|
||||
for (int x = 0; x < width; ++x) {
|
||||
Ray ray;
|
||||
generateRay(raster2camera, camera2world, x, y, ray);
|
||||
generateRay(raster2camera, camera2world, x * widthScale,
|
||||
y * heightScale, ray);
|
||||
BVHIntersect(nodes, triangles, ray);
|
||||
|
||||
int offset = y * width + x;
|
||||
|
||||
6
examples/simple/simple.vcxproj
Executable file → Normal file
6
examples/simple/simple.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -28,7 +28,7 @@
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
ispc -O2 %(Filename).ispco %(Filename).obj -h %(Filename)_ispc.h
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
@@ -161,4 +161,4 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
2
examples/stencil/.gitignore
vendored
Normal file
2
examples/stencil/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
stencil
|
||||
objs
|
||||
35
examples/stencil/Makefile
Normal file
35
examples/stencil/Makefile
Normal file
@@ -0,0 +1,35 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
|
||||
default: stencil
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ stencil
|
||||
|
||||
stencil: dirs objs/stencil.o objs/stencil_serial.o objs/stencil_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/stencil.o objs/stencil_ispc.o objs/stencil_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/stencil.o: objs/stencil_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
186
examples/stencil/stencil.cpp
Normal file
186
examples/stencil/stencil.cpp
Normal file
@@ -0,0 +1,186 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include <math.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "stencil_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
// Verifies at startup that the CPU supports the instruction set the ispc
// code was compiled for.  The ISA is chosen at compile time from the
// ISPC_TARGET_* macros; if the matching CPUSupports*() query returns false,
// an explanatory message is written to stderr and the process exits with
// status 1.  Returns normally otherwise.
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
// Exactly one branch is compiled in; an unrecognized target is a build error
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
// The second message is assembled from adjacent string literals; the
// preprocessor picks the build-file name that matches the toolchain.
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Serial reference implementation of the stencil time loop.  It is only
// declared here (presumably defined in stencil_serial.cpp, which the
// Makefile links in -- confirm).  The parameter list mirrors the ispc entry
// points loop_stencil_ispc() / loop_stencil_ispc_tasks():
//   t0, t1          half-open range of time steps
//   x0..z1          half-open index ranges of the cells to update
//   Nx, Ny, Nz      dimensions of the full volume
//   coef            the four stencil coefficients
//   vsq             per-cell factor applied to the stencil sum
//   Aeven, Aodd     the two buffers that alternate as input and output
// The array bound on coef is documentation only (array parameters are
// adjusted to pointers); it is written as 4 to agree with the ispc
// signatures and with the 4-element array the caller passes.
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
                                int y0, int y1, int z0, int z1,
                                int Nx, int Ny, int Nz,
                                const float coef[4],
                                const float vsq[],
                                float Aeven[], float Aodd[]);
|
||||
|
||||
|
||||
// Fills the simulation buffers with deterministic start values.
//
//   Nx, Ny, Nz  dimensions of the volume; cells are stored x-fastest,
//               then y, then z (offset = z*Nx*Ny + y*Nx + x).
//   A           A[0] receives x/Nx for cells in the lower half of the x
//               range (x < Nx/2) and y/Ny for the rest; A[1] is zeroed.
//   vsq         receives x*y*z / (Nx*Ny*Nz).
//
// Each of A[0], A[1] and vsq must have room for Nx*Ny*Nz floats.  Nothing
// is written when any dimension is <= 0.
void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
    if (Nx <= 0 || Ny <= 0 || Nz <= 0)
        return;

    const float fNx = float(Nx);
    const float fNy = float(Ny);
    // The products are formed in double rather than int so that large
    // grids cannot overflow.  Whenever the int product would have fit,
    // the double product is exact and converts to the same float value.
    const float cellCount = float(double(Nx) * Ny * Nz);

    // Walk the three buffers with pointers instead of a running int index,
    // which would itself overflow once the volume exceeds INT_MAX cells.
    float *even = A[0];
    float *odd = A[1];
    float *v = vsq;
    for (int z = 0; z < Nz; ++z)
        for (int y = 0; y < Ny; ++y)
            for (int x = 0; x < Nx; ++x) {
                *even++ = (x < Nx / 2) ? x / fNx : y / fNy;
                *odd++ = 0;
                *v++ = float(double(x) * y * z) / cellCount;
            }
}
|
||||
|
||||
|
||||
int main() {
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
int Nx = 256, Ny = 256, Nz = 256;
|
||||
int width = 4;
|
||||
float *Aserial[2], *Aispc[2];
|
||||
Aserial[0] = new float [Nx * Ny * Nz];
|
||||
Aserial[1] = new float [Nx * Ny * Nz];
|
||||
Aispc[0] = new float [Nx * Ny * Nz];
|
||||
Aispc[1] = new float [Nx * Ny * Nz];
|
||||
float *vsq = new float [Nx * Ny * Nz];
|
||||
|
||||
float coeff[4] = { 0.5, -.25, .125, -.0625 };
|
||||
|
||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation on one core; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
double minTimeISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aispc[0], Aispc[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPC = std::min(minTimeISPC, dt);
|
||||
}
|
||||
|
||||
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
|
||||
|
||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation with tasks; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
double minTimeISPCTasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aispc[0], Aispc[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
|
||||
}
|
||||
|
||||
printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
|
||||
|
||||
InitData(Nx, Ny, Nz, Aserial, vsq);
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
// minimum time.
|
||||
//
|
||||
double minTimeSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aserial[0], Aserial[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeSerial = std::min(minTimeSerial, dt);
|
||||
}
|
||||
|
||||
printf("[stencil serial]:\t\t[%.3f] millon cycles\n", minTimeSerial);
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||
|
||||
// Check for agreement
|
||||
int offset = 0;
|
||||
for (int z = 0; z < Nz; ++z)
|
||||
for (int y = 0; y < Ny; ++y)
|
||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||
float error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
|
||||
Aserial[1][offset]);
|
||||
if (error > 1e-4)
|
||||
printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
|
||||
x, y, z, Aispc[1][offset], Aserial[1][offset]);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
129
examples/stencil/stencil.ispc
Normal file
129
examples/stencil/stencil.ispc
Normal file
@@ -0,0 +1,129 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
// Performs one time step of the stencil over the cells
// [x0,x1) x [y0,y1) x [z0,z1) of an Nx x Ny x Nz volume stored x-fastest.
//
// For every cell the stencil sum combines the centre value (coef[0]) with
// the six axis neighbours at distance 1 (coef[1]), 2 (coef[2]) and
// 3 (coef[3]).  The output cell is then set to
//     2 * Ain[cell] - Aout[cell] + vsq[cell] * sum
// so Aout is read as well as written: its previous contents enter the
// update.
//
// No bounds checks are made here: neighbours up to 3 cells away are read
// along every axis, so the caller has to keep the update region at least
// 3 cells inside the volume.  Nz is not used in the body.
static void
|
||||
stencil_step(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const float coef[4], uniform const float vsq[],
|
||||
uniform const float Ain[], uniform float Aout[]) {
|
||||
// Number of cells in one z-slice, i.e. the index stride between slices
const uniform int Nxy = Nx * Ny;
|
||||
|
||||
for (uniform int z = z0; z < z1; ++z) {
|
||||
for (uniform int y = y0; y < y1; ++y) {
|
||||
// Assumes that (x1-x0) % programCount == 0
|
||||
for (uniform int x = x0; x < x1; x += programCount) {
|
||||
// Each program instance handles one of programCount consecutive x cells
int index = (z * Nxy) + (y * Nx) + x + programIndex;
|
||||
// Accessors for the cell at offset (x, y, z) relative to the current one
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
float div = coef[0] * A_cur(0, 0, 0) +
|
||||
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||
|
||||
// The right-hand side reads the old Aout value before it is overwritten
A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||
vsq[index] * div;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Task wrapper around stencil_step(): forwards all arguments unchanged so
// that one stencil step over the given sub-volume can be started with
// "launch".
static task void
|
||||
stencil_step_task(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const float coef[4], uniform const float vsq[],
|
||||
uniform const float Ain[], uniform float Aout[]) {
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||
}
|
||||
|
||||
|
||||
// Exported entry point: runs time steps t0 .. t1-1 of the stencil using
// tasks.  Within each time step one task is launched per slab of dz slices
// along z, and all of them are waited for before the next step begins.
// The two buffers swap roles every step: on even t the step reads Aeven
// and writes Aodd, on odd t it reads Aodd and writes Aeven.
export void
|
||||
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
|
||||
uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const float coef[4],
|
||||
uniform const float vsq[],
|
||||
uniform float Aeven[], uniform float Aodd[])
|
||||
{
|
||||
for (uniform int t = t0; t < t1; ++t) {
|
||||
// Parallelize across cores as well: each task will work on a slice
|
||||
// of "dz" in the z extent of the volume. (dz=1 seems to work
|
||||
// better than any larger values.)
|
||||
uniform int dz = 1;
|
||||
// NOTE(review): the slab end z+dz is not clamped to z1, which is only
// safe while dz divides (z1 - z0) -- always true for dz = 1.
for (uniform int z = z0; z < z1; z += dz) {
|
||||
// The parity of t selects which buffer is the input for this step
if ((t & 1) == 0)
|
||||
launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz,
|
||||
coef, vsq, Aeven, Aodd) >;
|
||||
else
|
||||
launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz,
|
||||
coef, vsq, Aodd, Aeven) >;
|
||||
}
|
||||
// We need to wait for all of the launched tasks to finish before
|
||||
// starting the next iteration.
|
||||
sync;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Exported entry point: runs time steps t0 .. t1-1 of the stencil without
// tasks, calling stencil_step() once per step over the whole update region.
// The two buffers swap roles every step: on even t the step reads Aeven
// and writes Aodd, on odd t it reads Aodd and writes Aeven.
export void
|
||||
loop_stencil_ispc(uniform int t0, uniform int t1,
|
||||
uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const float coef[4],
|
||||
uniform const float vsq[],
|
||||
uniform float Aeven[], uniform float Aodd[])
|
||||
{
|
||||
for (uniform int t = t0; t < t1; ++t) {
|
||||
// The parity of t selects which buffer is the input for this step
if ((t & 1) == 0)
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aeven, Aodd);
|
||||
else
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aodd, Aeven);
|
||||
}
|
||||
}
|
||||
172
examples/stencil/stencil.vcxproj
Normal file
172
examples/stencil/stencil.vcxproj
Normal file
@@ -0,0 +1,172 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{2ef070a1-f62f-4e6a-944b-88d140945c3c}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>rt</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stencil.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="stencil.cpp" />
|
||||
<ClCompile Include="stencil_serial.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
86
examples/stencil/stencil_serial.cpp
Normal file
86
examples/stencil/stencil_serial.cpp
Normal file
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
/** Performs one time step of the stencil over the box
    [x0,x1) x [y0,y1) x [z0,z1) of an Nx * Ny * Nz volume stored with x
    as the fastest-varying index.

    For every point, "div" is coef[0] times the center value plus, for
    each radius r = 1..3, coef[r] times the sum of the six axis neighbors
    at distance r.  The output is then updated as

        Aout = 2 * Ain - Aout + vsq * div

    so Aout is both read and written.  The caller must leave a border of
    at least three points on every side of the box, since neighbors up
    to distance 3 are read.  Nz is not used by the computation.
 */
static void
stencil_step(int x0, int x1,
             int y0, int y1,
             int z0, int z1,
             int Nx, int Ny, int Nz,
             const float coef[4], const float vsq[],
             const float Ain[], float Aout[]) {
    const int Nxy = Nx * Ny;

    for (int z = z0; z < z1; ++z) {
        for (int y = y0; y < y1; ++y) {
            for (int x = x0; x < x1; ++x) {
                const int index = (z * Nxy) + (y * Nx) + x;
                // Pointer to the current point; neighbors are reached with
                // plain offsets (no preprocessor macros that would leak
                // into the rest of the file).
                const float *cur = Ain + index;

                // The additions are performed in the same order as the
                // original expression (+x, -x, +y, -y, +z, -z for each
                // radius) so the floating-point result is unchanged.
                float div = coef[0] * cur[0];
                for (int r = 1; r <= 3; ++r) {
                    float ring = cur[r] + cur[-r];
                    ring += cur[r * Nx];
                    ring += cur[-r * Nx];
                    ring += cur[r * Nxy];
                    ring += cur[-r * Nxy];
                    div += coef[r] * ring;
                }

                Aout[index] = 2 * cur[0] - Aout[index] + vsq[index] * div;
            }
        }
    }
}
|
||||
|
||||
|
||||
void loop_stencil_serial(int t0, int t1,
|
||||
int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const float coef[4],
|
||||
const float vsq[],
|
||||
float Aeven[], float Aodd[])
|
||||
{
|
||||
for (int t = t0; t < t1; ++t) {
|
||||
if ((t & 1) == 0)
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aeven, Aodd);
|
||||
else
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aodd, Aeven);
|
||||
}
|
||||
}
|
||||
868
examples/tasksys.cpp
Normal file
868
examples/tasksys.cpp
Normal file
@@ -0,0 +1,868 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
This file implements simple task systems that provide the three
|
||||
entrypoints used by ispc-generated code to handle 'launch' and 'sync'
|
||||
statements in ispc programs. See the section "Task Parallelism: Language
|
||||
Syntax" in the ispc documentation for information about using task
|
||||
parallelism in ispc programs, and see the section "Task Parallelism:
|
||||
Runtime Requirements" for information about the task-related entrypoints
|
||||
that are implemented here.
|
||||
|
||||
There are three task systems in this file: one built using Microsoft's
|
||||
Concurrency Runtime, one built with Apple's Grand Central Dispatch, and
|
||||
one built on top of bare pthreads.
|
||||
*/
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ISPC_IS_WINDOWS
|
||||
#define ISPC_USE_CONCRT
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#define ISPC_USE_PTHREADS
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
// pthreads is noticeably more efficient than GCD on OSX
|
||||
#define ISPC_USE_PTHREADS
|
||||
//#define ISPC_USE_GCD
|
||||
#endif
|
||||
|
||||
#define DBG(x)
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
#ifdef ISPC_USE_CONCRT
|
||||
#include <concrt.h>
|
||||
using namespace Concurrency;
|
||||
#endif // ISPC_USE_CONCRT
|
||||
#ifdef ISPC_USE_GCD
|
||||
#include <dispatch/dispatch.h>
|
||||
#include <pthread.h>
|
||||
#endif // ISPC_USE_GCD
|
||||
#ifdef ISPC_USE_PTHREADS
|
||||
#include <pthread.h>
|
||||
#include <semaphore.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/sysctl.h>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#endif // ISPC_USE_PTHREADS
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif // ISPC_IS_LINUX
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
|
||||
// Function pointer type for the 'task' functions that ispc generates.
// The arguments are an opaque data pointer followed by the thread
// index/count and the task index/count.
typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,
                             int taskIndex, int taskCount);

// Per-task record: the function to call, the data pointer to hand to it,
// and the task's index within (and the size of) its launch.
struct TaskInfo {
    TaskFuncType func;
    void *data;
    int taskIndex;
    int taskCount;
#if defined(ISPC_IS_WINDOWS)
    // Set by the task runner when the task has finished.
    event taskEvent;
#endif
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// TaskGroupBase
|
||||
|
||||
#define LOG_TASK_QUEUE_CHUNK_SIZE 12
|
||||
#define MAX_TASK_QUEUE_CHUNKS 8
|
||||
#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
|
||||
|
||||
#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)
|
||||
|
||||
#define NUM_MEM_BUFFERS 16
|
||||
|
||||
class TaskGroup;
|
||||
|
||||
/** The TaskGroupBase structure provides common functionality for "task
|
||||
groups"; a task group is the set of tasks launched from within a single
|
||||
ispc function. When the function is ready to return, it waits for all
|
||||
of the tasks in its task group to finish before it actually returns.
|
||||
*/
|
||||
class TaskGroupBase {
|
||||
public:
|
||||
void Reset();
|
||||
|
||||
int AllocTaskInfo(int count);
|
||||
TaskInfo *GetTaskInfo(int index);
|
||||
|
||||
void *AllocMemory(int64_t size, int32_t alignment);
|
||||
|
||||
protected:
|
||||
TaskGroupBase();
|
||||
~TaskGroupBase();
|
||||
|
||||
int nextTaskInfoIndex;
|
||||
|
||||
private:
|
||||
/* We allocate blocks of TASK_QUEUE_CHUNK_SIZE TaskInfo structures as
|
||||
needed by the calling function. We hold up to MAX_TASK_QUEUE_CHUNKS
|
||||
of these (and then exit at runtime if more than this many tasks are
|
||||
launched.)
|
||||
*/
|
||||
TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
|
||||
|
||||
/* We also allocate chunks of memory to service ISPCAlloc() calls. The
|
||||
memBuffers[] array holds pointers to this memory. The first element
|
||||
of this array is initialized to point to mem and then any subsequent
|
||||
elements required are initialized with dynamic allocation.
|
||||
*/
|
||||
int curMemBuffer, curMemBufferOffset;
|
||||
int memBufferSize[NUM_MEM_BUFFERS];
|
||||
char *memBuffers[NUM_MEM_BUFFERS];
|
||||
char mem[256];
|
||||
|
||||
};
|
||||
|
||||
|
||||
/** Starts with no tasks and no TaskInfo chunks, and with the in-object
    'mem' array as the only memory buffer. */
inline TaskGroupBase::TaskGroupBase()
    : nextTaskInfoIndex(0), curMemBuffer(0), curMemBufferOffset(0) {
    // TaskInfo chunks are created on demand.
    for (int chunk = 0; chunk < MAX_TASK_QUEUE_CHUNKS; ++chunk)
        taskInfo[chunk] = NULL;

    // Buffer 0 is the embedded array; the others are allocated on demand.
    memBuffers[0] = mem;
    memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
    for (int buf = 1; buf < NUM_MEM_BUFFERS; ++buf) {
        memBuffers[buf] = NULL;
        memBufferSize[buf] = 0;
    }
}
|
||||
|
||||
|
||||
/** Releases every heap allocation owned by the task group: the
    dynamically allocated memory buffers and the TaskInfo chunks. */
inline TaskGroupBase::~TaskGroupBase() {
    // Note: don't delete memBuffers[0], since it points to the start of
    // the "mem" member!
    for (int i = 1; i < NUM_MEM_BUFFERS; ++i)
        delete[] memBuffers[i];

    // The TaskInfo chunks are allocated with new[] in GetTaskInfo();
    // entries that were never needed are NULL, which delete[] accepts.
    for (int i = 0; i < MAX_TASK_QUEUE_CHUNKS; ++i)
        delete[] taskInfo[i];
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroupBase::Reset() {
|
||||
nextTaskInfoIndex = 0;
|
||||
curMemBuffer = 0;
|
||||
curMemBufferOffset = 0;
|
||||
}
|
||||
|
||||
|
||||
inline int
|
||||
TaskGroupBase::AllocTaskInfo(int count) {
|
||||
int ret = nextTaskInfoIndex;
|
||||
nextTaskInfoIndex += count;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/** Returns a pointer to the TaskInfo slot with the given index,
    allocating the chunk that contains it on first use.

    Slots are stored in chunks of TASK_QUEUE_CHUNK_SIZE entries; at most
    MAX_TASK_QUEUE_CHUNKS chunks are available.  If the index falls
    outside that range, an error is printed and the process exits.
 */
inline TaskInfo *
TaskGroupBase::GetTaskInfo(int index) {
    const int chunk = (index >> LOG_TASK_QUEUE_CHUNK_SIZE);
    const int offset = index & (TASK_QUEUE_CHUNK_SIZE-1);

    // Reject anything outside the chunk table, not only the first index
    // past its end: a larger (or negative) index would otherwise read and
    // write beyond the bounds of taskInfo[].
    if (chunk < 0 || chunk >= MAX_TASK_QUEUE_CHUNKS) {
        fprintf(stderr, "A total of %d tasks have been launched from the "
                "current function--the simple built-in task system can handle "
                "no more. You can increase the values of MAX_TASK_QUEUE_CHUNKS "
                "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation.  "
                "Sorry!  Exiting.\n", index);
        exit(1);
    }

    if (taskInfo[chunk] == NULL)
        taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
    return &taskInfo[chunk][offset];
}
|
||||
|
||||
|
||||
inline void *
|
||||
TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
|
||||
char *basePtr = memBuffers[curMemBuffer];
|
||||
int64_t iptr = (int64_t)(basePtr + curMemBufferOffset);
|
||||
iptr = (iptr + (alignment-1)) & ~(alignment-1);
|
||||
|
||||
int newOffset = int(iptr + size - (int64_t)basePtr);
|
||||
if (newOffset < memBufferSize[curMemBuffer]) {
|
||||
curMemBufferOffset = newOffset;
|
||||
return (char *)iptr;
|
||||
}
|
||||
|
||||
++curMemBuffer;
|
||||
curMemBufferOffset = 0;
|
||||
assert(curMemBuffer < NUM_MEM_BUFFERS);
|
||||
|
||||
int allocSize = 1 << (12 + curMemBuffer);
|
||||
allocSize = std::max(int(size+alignment), allocSize);
|
||||
char *newBuf = new char[allocSize];
|
||||
memBufferSize[curMemBuffer] = allocSize;
|
||||
memBuffers[curMemBuffer] = newBuf;
|
||||
return AllocMemory(size, alignment);
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Atomics and the like
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
/** Issues a full memory barrier.  The compiler builtin is used instead of
    a hard-coded x86 "mfence" instruction so that the right barrier is
    emitted for whatever target the file is compiled for. */
static inline void
lMemFence() {
    __sync_synchronize();
}
#endif // !ISPC_IS_WINDOWS
|
||||
|
||||
|
||||
// Determine the size of a pointer, in bytes.  The compiler-provided
// __SIZEOF_POINTER__ is trusted when it is available.  Otherwise the
// 64-bit platform macros are tested before the 32-bit ones, because
// _WIN32 is also defined when targeting 64-bit Windows.
#if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
#define ISPC_POINTER_BYTES 8
#elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
#define ISPC_POINTER_BYTES 4
#elif defined(_WIN64) || defined(__x86_64__) || defined(__amd64__)
#define ISPC_POINTER_BYTES 8
#elif defined(_WIN32) || defined(__i386__)
#define ISPC_POINTER_BYTES 4
#else
#error "Pointer size unknown!"
#endif // __SIZEOF_POINTER__
|
||||
|
||||
|
||||
/** Atomic compare-and-swap on a pointer: if *v equals oldValue, stores
    newValue into *v.  Returns the value *v held before the operation in
    either case, so the swap happened iff the result equals oldValue.

    Note the argument order: the new value comes before the expected old
    value.
 */
static void *
lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
#ifdef ISPC_IS_WINDOWS
    return InterlockedCompareExchangePointer(v, newValue, oldValue);
#else
    // The builtin picks the instruction for the target's pointer size and
    // acts as a full memory barrier, so no separate fence is needed.
    return __sync_val_compare_and_swap(v, oldValue, newValue);
#endif // ISPC_IS_WINDOWS
}
|
||||
|
||||
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
/** Atomic 32-bit compare-and-swap: if *v equals oldValue, stores newValue
    into *v.  Returns the value *v held before the operation in either
    case.  Note that the new value is passed before the expected old
    value. */
static int32_t
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
    // The builtin is a full memory barrier, so no extra fence is needed.
    return __sync_val_compare_and_swap(v, oldValue, newValue);
}
#endif // !ISPC_IS_WINDOWS
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef ISPC_USE_CONCRT
// Concurrency Runtime flavor of the task group.  It adds no state of its
// own to TaskGroupBase, only the two scheduling entry points.
class TaskGroup : public TaskGroupBase {
public:
    // Waits for the tasks of this group to finish.
    void Sync();
    // Schedules the 'count' tasks whose slots start at 'baseIndex'.
    void Launch(int baseIndex, int count);
};
#endif // ISPC_USE_CONCRT
|
||||
|
||||
#ifdef ISPC_USE_GCD
/* Grand Central Dispatch flavor of the task group.  Each task group has
   its own GCD dispatch group; tasks are submitted to it by Launch() and
   Sync() waits on it.
 */
class TaskGroup : public TaskGroupBase {
public:
    TaskGroup() : gcdGroup(dispatch_group_create()) { }

    void Launch(int baseIndex, int count);
    void Sync();

private:
    dispatch_group_t gcdGroup;
};
#endif // ISPC_USE_GCD
|
||||
|
||||
#ifdef ISPC_USE_PTHREADS
static void *lTaskEntry(void *arg);

/* pthreads flavor of the task group.  In addition to the base class
   state it tracks the number of launched-but-unfinished tasks, the list
   of task indices still waiting to be picked up by a thread, and whether
   the group is currently on the global list of active task groups. */
class TaskGroup : public TaskGroupBase {
public:
    TaskGroup() : numUnfinishedTasks(0), inActiveList(false) {
        waitingTasks.reserve(128);
    }

    // Makes the group ready for reuse; it must not be on the active list.
    void Reset() {
        TaskGroupBase::Reset();
        numUnfinishedTasks = 0;
        assert(!inActiveList);
        lMemFence();
    }

    void Launch(int baseIndex, int count);
    void Sync();

private:
    // The worker thread entry point manipulates the private state below.
    friend void *lTaskEntry(void *arg);

    int32_t numUnfinishedTasks;
    int32_t pad[3];
    std::vector<int> waitingTasks;
    bool inActiveList;
};

#endif // ISPC_USE_PTHREADS
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Grand Central Dispatch
|
||||
|
||||
#ifdef ISPC_USE_GCD
|
||||
|
||||
/* A simple task system for ispc programs based on Apple's Grand Central
|
||||
Dispatch. */
|
||||
|
||||
static dispatch_queue_t gcdQueue;
|
||||
static volatile int32_t lock = 0;
|
||||
|
||||
static void
|
||||
InitTaskSystem() {
|
||||
if (gcdQueue != NULL)
|
||||
return;
|
||||
|
||||
while (1) {
|
||||
if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
|
||||
if (gcdQueue == NULL) {
|
||||
gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
|
||||
assert(gcdQueue != NULL);
|
||||
lMemFence();
|
||||
}
|
||||
lock = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
lRunTask(void *ti) {
|
||||
TaskInfo *taskInfo = (TaskInfo *)ti;
|
||||
// FIXME: these are bogus values; may cause bugs in code that depends
|
||||
// on them having unique values in different threads.
|
||||
int threadIndex = 0;
|
||||
int threadCount = 1;
|
||||
|
||||
// Actually run the task
|
||||
taskInfo->func(taskInfo->data, threadIndex, threadCount,
|
||||
taskInfo->taskIndex, taskInfo->taskCount);
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroup::Launch(int baseIndex, int count) {
|
||||
for (int i = 0; i < count; ++i) {
|
||||
TaskInfo *ti = GetTaskInfo(baseIndex + i);
|
||||
dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Waits, with no timeout, on this task group's GCD dispatch group.
inline void
|
||||
TaskGroup::Sync() {
|
||||
dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
|
||||
}
|
||||
|
||||
#endif // ISPC_USE_GCD
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Concurrency Runtime
|
||||
|
||||
#ifdef ISPC_USE_CONCRT
|
||||
|
||||
// Task system initialization hook for the Concurrency Runtime build;
// intentionally empty.
static void
|
||||
InitTaskSystem() {
|
||||
// No initialization needed
|
||||
}
|
||||
|
||||
|
||||
static void __cdecl
|
||||
lRunTask(LPVOID param) {
|
||||
TaskInfo *ti = (TaskInfo *)param;
|
||||
|
||||
// Actually run the task.
|
||||
// FIXME: like the GCD implementation for OS X, this is passing bogus
|
||||
// values for the threadIndex and threadCount builtins, which in turn
|
||||
// will cause bugs in code that uses those.
|
||||
int threadIndex = 0;
|
||||
int threadCount = 1;
|
||||
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
|
||||
|
||||
// Signal the event that this task is done
|
||||
ti->taskEvent.set();
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroup::Launch(int baseIndex, int count) {
|
||||
for (int i = 0; i < count; ++i)
|
||||
CurrentScheduler::ScheduleTask(lRunTask, GetTaskInfo(baseIndex + i));
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroup::Sync() {
|
||||
for (int i = 0; i < nextTaskInfoIndex; ++i) {
|
||||
TaskInfo *ti = GetTaskInfo(i);
|
||||
ti->taskEvent.wait();
|
||||
ti->taskEvent.reset();
|
||||
}
|
||||
}
|
||||
|
||||
#endif // ISPC_USE_CONCRT
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// pthreads
|
||||
|
||||
#ifdef ISPC_USE_PTHREADS
|
||||
|
||||
static volatile int32_t lock = 0;
|
||||
|
||||
static int nThreads;
|
||||
static pthread_t *threads = NULL;
|
||||
|
||||
static pthread_mutex_t taskSysMutex;
|
||||
static std::vector<TaskGroup *> activeTaskGroups;
|
||||
static sem_t *workerSemaphore;
|
||||
|
||||
|
||||
/** Atomically adds 'delta' to *v and returns the value *v held before the
    addition.  Uses the compiler builtin rather than x86-specific inline
    assembly so the code is not tied to one instruction set. */
static inline int32_t
lAtomicAdd(int32_t *v, int32_t delta) {
    return __sync_fetch_and_add(v, delta);
}
|
||||
|
||||
|
||||
static void *
|
||||
lTaskEntry(void *arg) {
|
||||
int threadIndex = (int)((int64_t)arg);
|
||||
int threadCount = nThreads;
|
||||
|
||||
while (1) {
|
||||
int err;
|
||||
//
|
||||
// Wait on the semaphore until we're woken up due to the arrival of
|
||||
// more work.
|
||||
//
|
||||
if ((err = sem_wait(workerSemaphore)) != 0) {
|
||||
fprintf(stderr, "Error from sem_wait: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Acquire the mutex
|
||||
//
|
||||
if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (activeTaskGroups.size() == 0) {
|
||||
//
|
||||
// Task queue is empty, go back and wait on the semaphore
|
||||
//
|
||||
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
//
|
||||
// Get the last task group on the active list and the last task
|
||||
// from its waiting tasks list.
|
||||
//
|
||||
TaskGroup *tg = activeTaskGroups.back();
|
||||
assert(tg->waitingTasks.size() > 0);
|
||||
int taskNumber = tg->waitingTasks.back();
|
||||
tg->waitingTasks.pop_back();
|
||||
|
||||
if (tg->waitingTasks.size() == 0) {
|
||||
// We just took the last task from this task group, so remove
|
||||
// it from the active list.
|
||||
activeTaskGroups.pop_back();
|
||||
tg->inActiveList = false;
|
||||
}
|
||||
|
||||
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// And now actually run the task
|
||||
//
|
||||
DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg));
|
||||
TaskInfo *myTask = tg->GetTaskInfo(taskNumber);
|
||||
myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex,
|
||||
myTask->taskCount);
|
||||
|
||||
//
|
||||
// Decrement the "number of unfinished tasks" counter in the task
|
||||
// group.
|
||||
//
|
||||
lMemFence();
|
||||
lAtomicAdd(&tg->numUnfinishedTasks, -1);
|
||||
}
|
||||
|
||||
pthread_exit(NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
InitTaskSystem() {
|
||||
if (threads == NULL) {
|
||||
while (1) {
|
||||
if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
|
||||
if (threads == NULL) {
|
||||
// We launch one fewer thread than there are cores,
|
||||
// since the main thread here will also grab jobs from
|
||||
// the task queue itself.
|
||||
nThreads = sysconf(_SC_NPROCESSORS_ONLN) - 1;
|
||||
|
||||
int err;
|
||||
if ((err = pthread_mutex_init(&taskSysMutex, NULL)) != 0) {
|
||||
fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
char name[32];
|
||||
sprintf(name, "ispc_task.%d", (int)getpid());
|
||||
workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
|
||||
if (!workerSemaphore) {
|
||||
fprintf(stderr, "Error creating semaphore: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t));
|
||||
for (int i = 0; i < nThreads; ++i) {
|
||||
err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)(i));
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
activeTaskGroups.reserve(64);
|
||||
}
|
||||
|
||||
// Make sure all of the above goes to memory before we
|
||||
// clear the lock.
|
||||
lMemFence();
|
||||
lock = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroup::Launch(int baseCoord, int count) {
|
||||
//
|
||||
// Acquire mutex, add task
|
||||
//
|
||||
int err;
|
||||
if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Add the corresponding set of tasks to the waiting-to-be-run list for
|
||||
// this task group.
|
||||
//
|
||||
// FIXME: it's a little ugly to hold a global mutex for this when we
|
||||
// only need to make sure no one else is accessing this task group's
|
||||
// waitingTasks list. (But a small experiment in switching to a
|
||||
// per-TaskGroup mutex showed worse performance!)
|
||||
for (int i = 0; i < count; ++i)
|
||||
waitingTasks.push_back(baseCoord + i);
|
||||
|
||||
// Add the task group to the global active list if it isn't there
|
||||
// already.
|
||||
if (inActiveList == false) {
|
||||
activeTaskGroups.push_back(this);
|
||||
inActiveList = true;
|
||||
}
|
||||
|
||||
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Update the count of the number of tasks left to run in this task
|
||||
// group.
|
||||
//
|
||||
lMemFence();
|
||||
lAtomicAdd(&numUnfinishedTasks, count);
|
||||
|
||||
//
|
||||
// Post to the worker semaphore to wake up worker threads that are
|
||||
// sleeping waiting for tasks to show up
|
||||
//
|
||||
for (int i = 0; i < count; ++i)
|
||||
if ((err = sem_post(workerSemaphore)) != 0) {
|
||||
fprintf(stderr, "Error from sem_post: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroup::Sync() {
|
||||
DBG(fprintf(stderr, "syncing %p - %d unfinished\n", tg, numUnfinishedTasks));
|
||||
|
||||
while (numUnfinishedTasks > 0) {
|
||||
// All of the tasks in this group aren't finished yet. We'll try
|
||||
// to help out here since we don't have anything else to do...
|
||||
|
||||
DBG(fprintf(stderr, "while syncing %p - %d unfinished\n", tg,
|
||||
numUnfinishedTasks));
|
||||
|
||||
//
|
||||
// Acquire the global task system mutex to grab a task to work on
|
||||
//
|
||||
int err;
|
||||
if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
TaskInfo *myTask = NULL;
|
||||
TaskGroup *runtg = this;
|
||||
if (waitingTasks.size() > 0) {
|
||||
int taskNumber = waitingTasks.back();
|
||||
waitingTasks.pop_back();
|
||||
|
||||
if (waitingTasks.size() == 0) {
|
||||
// There's nothing left to start running from this group,
|
||||
// so remove it from the active task list.
|
||||
activeTaskGroups.erase(std::find(activeTaskGroups.begin(),
|
||||
activeTaskGroups.end(), this));
|
||||
inActiveList = false;
|
||||
}
|
||||
myTask = GetTaskInfo(taskNumber);
|
||||
DBG(fprintf(stderr, "running task %d from group %p in sync\n", taskNumber, tg));
|
||||
}
|
||||
else {
|
||||
// Other threads are already working on all of the tasks in
|
||||
// this group, so we can't help out by running one ourself.
|
||||
// We'll try to run one from another group to make ourselves
|
||||
// useful here.
|
||||
if (activeTaskGroups.size() == 0) {
|
||||
// No active task groups left--there's nothing for us to do.
|
||||
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
// FIXME: We basically end up busy-waiting here, which is
|
||||
// extra wasteful in a world with hyperthreading. It would
|
||||
// be much better to put this thread to sleep on a
|
||||
// condition variable that was signaled when the last task
|
||||
// in this group was finished.
|
||||
sleep(0);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Get a task to run from another task group.
|
||||
runtg = activeTaskGroups.back();
|
||||
assert(runtg->waitingTasks.size() > 0);
|
||||
|
||||
int taskNumber = runtg->waitingTasks.back();
|
||||
runtg->waitingTasks.pop_back();
|
||||
if (runtg->waitingTasks.size() == 0) {
|
||||
// There's left to start running from this group, so remove
|
||||
// it from the active task list.
|
||||
activeTaskGroups.pop_back();
|
||||
runtg->inActiveList = false;
|
||||
}
|
||||
myTask = runtg->GetTaskInfo(taskNumber);
|
||||
DBG(fprintf(stderr, "running task %d from other group %p in sync\n",
|
||||
taskNumber, runtg));
|
||||
}
|
||||
|
||||
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Do work for _myTask_
|
||||
//
|
||||
// FIXME: bogus values for thread index/thread count here as well..
|
||||
myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount);
|
||||
|
||||
//
|
||||
// Decrement the number of unfinished tasks counter
|
||||
//
|
||||
lMemFence();
|
||||
lAtomicAdd(&runtg->numUnfinishedTasks, -1);
|
||||
}
|
||||
DBG(fprintf(stderr, "sync for %p done!n", tg));
|
||||
}
|
||||
|
||||
#endif // ISPC_USE_PTHREADS
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define MAX_FREE_TASK_GROUPS 64
|
||||
static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
|
||||
|
||||
// Returns a TaskGroup for the caller to use: one taken from the
// freeTaskGroups cache if a slot can be claimed, otherwise a newly
// allocated one.
static inline TaskGroup *
AllocTaskGroup() {
    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
        TaskGroup *tg = freeTaskGroups[i];
        if (tg != NULL) {
            // Try to swap the slot from 'tg' to NULL.  The call returns the
            // value that was in the slot; only if that is still 'tg' did
            // the swap happen and the group become ours.  Any other result
            // (NULL, or a different group that was put there meanwhile)
            // means we lost the race for this slot, so move on.
            void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);
            if (ptr == (void *)tg)
                return tg;
        }
    }

    return new TaskGroup;
}
|
||||
|
||||
|
||||
static inline void
|
||||
FreeTaskGroup(TaskGroup *tg) {
|
||||
tg->Reset();
|
||||
|
||||
for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
|
||||
if (freeTaskGroups[i] == NULL) {
|
||||
void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL);
|
||||
if (ptr == NULL)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
delete tg;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Entry points called from ispc-generated code.  They are declared with C
// linkage so that their names are not mangled.
extern "C" {
    // Returns memory obtained from the task group behind *handlePtr.
    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);

    // Launches 'count' tasks running function 'f' on 'data'.
    void ISPCLaunch(void **handlePtr, void *f, void *data, int count);

    // Waits for the tasks of the given group to finish.
    void ISPCSync(void *handle);
}
|
||||
|
||||
void
|
||||
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
|
||||
TaskGroup *taskGroup;
|
||||
if (*taskGroupPtr == NULL) {
|
||||
InitTaskSystem();
|
||||
taskGroup = AllocTaskGroup();
|
||||
*taskGroupPtr = taskGroup;
|
||||
}
|
||||
else
|
||||
taskGroup = (TaskGroup *)(*taskGroupPtr);
|
||||
|
||||
int baseIndex = taskGroup->AllocTaskInfo(count);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i);
|
||||
ti->func = (TaskFuncType)func;
|
||||
ti->data = data;
|
||||
ti->taskIndex = i;
|
||||
ti->taskCount = count;
|
||||
}
|
||||
taskGroup->Launch(baseIndex, count);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ISPCSync(void *h) {
|
||||
TaskGroup *taskGroup = (TaskGroup *)h;
|
||||
if (taskGroup != NULL) {
|
||||
taskGroup->Sync();
|
||||
FreeTaskGroup(taskGroup);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void *
|
||||
ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) {
|
||||
TaskGroup *taskGroup;
|
||||
if (*taskGroupPtr == NULL) {
|
||||
InitTaskSystem();
|
||||
taskGroup = AllocTaskGroup();
|
||||
*taskGroupPtr = taskGroup;
|
||||
}
|
||||
else
|
||||
taskGroup = (TaskGroup *)(*taskGroupPtr);
|
||||
|
||||
return taskGroup->AllocMemory(size, alignment);
|
||||
}
|
||||
@@ -38,7 +38,9 @@
|
||||
#include <windows.h>
|
||||
#define rdtsc __rdtsc
|
||||
#else
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
__inline__ uint64_t rdtsc() {
|
||||
uint32_t low, high;
|
||||
__asm__ __volatile__ (
|
||||
@@ -48,7 +50,9 @@ extern "C" {
|
||||
"rdtsc" : "=a" (low), "=d" (high));
|
||||
return (uint64_t)high << 32 | low;
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
#endif
|
||||
|
||||
static uint64_t start, end;
|
||||
|
||||
2
examples/volume_rendering/.gitignore
vendored
Normal file
2
examples/volume_rendering/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
volume
|
||||
*.ppm
|
||||
35
examples/volume_rendering/Makefile
Normal file
35
examples/volume_rendering/Makefile
Normal file
@@ -0,0 +1,35 @@
|
||||
|
||||
# Builds the 'volume' example: the C++ driver, the serial reference
# implementation, the ispc-compiled kernels and the shared task system.

ARCH = $(shell uname)

TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))

CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

default: volume

.PHONY: default dirs clean

dirs:
	/bin/mkdir -p objs/

clean:
	/bin/rm -rf objs *~ volume

# 'dirs' is an order-only prerequisite (after the '|') everywhere: objs/
# must exist before anything is written into it, including under 'make -j',
# but the phony target must not force a rebuild on every run.
volume: objs/volume.o objs/volume_serial.o objs/volume_ispc.o $(TASK_OBJ) | dirs
	$(CXX) $(CXXFLAGS) -o $@ objs/volume.o objs/volume_ispc.o objs/volume_serial.o $(TASK_OBJ) -lm $(TASK_LIB)

objs/%.o: %.cpp | dirs
	$(CXX) $< $(CXXFLAGS) -c -o $@

objs/%.o: ../%.cpp | dirs
	$(CXX) $< $(CXXFLAGS) -c -o $@

# volume.cpp includes the header that ispc generates.
objs/volume.o: objs/volume_ispc.h

objs/%_ispc.h objs/%_ispc.o: %.ispc | dirs
	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
11
examples/volume_rendering/camera.dat
Normal file
11
examples/volume_rendering/camera.dat
Normal file
@@ -0,0 +1,11 @@
|
||||
896 1184
|
||||
|
||||
0.000155 0.000000 0.000000 -0.069927
|
||||
0.000000 -0.000155 0.000000 0.093236
|
||||
0.000000 0.000000 0.000000 1.000000
|
||||
0.000000 0.000000 -99.999001 100.000000
|
||||
|
||||
1.000000 0.000000 0.000000 1.000000
|
||||
0.000000 0.980129 -0.198360 2.900000
|
||||
0.000000 0.198360 0.980129 -10.500000
|
||||
0.000000 0.000000 0.000000 1.000000
|
||||
5
examples/volume_rendering/density_highres.vol
Normal file
5
examples/volume_rendering/density_highres.vol
Normal file
File diff suppressed because one or more lines are too long
4
examples/volume_rendering/density_lowres.vol
Normal file
4
examples/volume_rendering/density_lowres.vol
Normal file
File diff suppressed because one or more lines are too long
248
examples/volume_rendering/volume.cpp
Normal file
248
examples/volume_rendering/volume.cpp
Normal file
@@ -0,0 +1,248 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "volume_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
extern void volume_serial(float density[], int nVoxels[3],
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
int width, int height, float image[]);
|
||||
|
||||
/* Write a binary PPM (P6) image file with the image.

   buf holds width*height float samples.  Each one is scaled by 255,
   clamped to [0,255] and written three times, once per color channel.
   If the file can't be opened, the error is reported with perror() and
   nothing is written. */
static void
writePPM(float *buf, int width, int height, const char *fn) {
    FILE *fp = fopen(fn, "wb");
    if (!fp) {
        perror(fn);
        return;
    }
    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", width, height);
    fprintf(fp, "255\n");
    for (int i = 0; i < width*height; ++i) {
        float v = buf[i] * 255.f;
        // Phrased as "not greater than zero" so that a NaN sample is also
        // mapped to 0 instead of reaching the integer conversion below.
        if (!(v > 0.f)) v = 0.f;
        else if (v > 255.f) v = 255.f;
        unsigned char c = (unsigned char)v;
        for (int j = 0; j < 3; ++j)
            fputc(c, fp);
    }
    fclose(fp);
    printf("Wrote image file %s\n", fn);
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Reads the camera description from the text file fn: first the image
   width and height, then the 16 entries of raster2camera followed by the
   16 entries of camera2world, each matrix in row-major order.  Exits with
   a message if the file can't be opened or ends early.

   FIXME: we should add support to be able to specify viewing parameters
   in the program here directly. */
static void
loadCamera(const char *fn, int *width, int *height, float raster2camera[4][4],
           float camera2world[4][4]) {
    FILE *in = fopen(fn, "r");
    if (in == NULL) {
        perror(fn);
        exit(1);
    }

    if (fscanf(in, "%d %d", width, height) != 2) {
        fprintf(stderr, "Unexpected end of file in camera file\n");
        exit(1);
    }

    // Both matrices are read the same way, one after the other.
    float (*matrices[2])[4] = { raster2camera, camera2world };
    for (int m = 0; m < 2; ++m) {
        for (int k = 0; k < 16; ++k) {
            float *entry = &matrices[m][k / 4][k % 4];
            if (fscanf(in, "%f", entry) != 1) {
                fprintf(stderr, "Unexpected end of file in camera file\n");
                exit(1);
            }
        }
    }

    fclose(in);
}
|
||||
|
||||
|
||||
/* Load a volume density file.  Expects the number of x, y, and z samples
   as the first three values (as integer strings), then x*y*z
   floating-point values (also as strings) to give the densities.

   The resolution is stored in n[]; the return value is a new[]-allocated
   array of n[0]*n[1]*n[2] densities that the caller owns.  Exits with a
   message if the file can't be opened, the resolution is missing or not
   usable, or the file ends early. */
static float *
loadVolume(const char *fn, int n[3]) {
    FILE *f = fopen(fn, "r");
    if (!f) {
        perror(fn);
        exit(1);
    }

    if (fscanf(f, "%d %d %d", &n[0], &n[1], &n[2]) != 3) {
        fprintf(stderr, "Couldn't find resolution at start of density file\n");
        exit(1);
    }

    // Reject resolutions that would give a zero, negative or overflowing
    // voxel count before using the product as an allocation size.
    const long long maxCount = 0x7fffffffLL;
    bool valid = (n[0] > 0 && n[1] > 0 && n[2] > 0);
    if (valid) {
        long long slice = (long long)n[0] * n[1];
        valid = (slice <= maxCount && slice * n[2] <= maxCount);
    }
    if (!valid) {
        fprintf(stderr, "Invalid resolution %d x %d x %d in density file\n",
                n[0], n[1], n[2]);
        exit(1);
    }

    int count = n[0] * n[1] * n[2];
    float *v = new float[count];
    for (int i = 0; i < count; ++i) {
        if (fscanf(f, "%f", &v[i]) != 1) {
            fprintf(stderr, "Unexpected end of file at %d'th density value\n", i);
            exit(1);
        }
    }

    fclose(f);
    return v;
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 3) {
|
||||
fprintf(stderr, "usage: volume <camera.dat> <volume_density.vol>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
//
|
||||
// Load viewing data and the volume density data
|
||||
//
|
||||
int width, height;
|
||||
float raster2camera[4][4], camera2world[4][4];
|
||||
loadCamera(argv[1], &width, &height, raster2camera, camera2world);
|
||||
float *image = new float[width*height];
|
||||
|
||||
int n[3];
|
||||
float *density = loadVolume(argv[2], n);
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation; report the minimum
|
||||
// time of three runs.
|
||||
//
|
||||
double minISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
volume_ispc(density, n, raster2camera, camera2world,
|
||||
width, height, image);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minISPC = std::min(minISPC, dt);
|
||||
}
|
||||
|
||||
printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
|
||||
writePPM(image, width, height, "volume-ispc-1core.ppm");
|
||||
|
||||
// Clear out the buffer
|
||||
for (int i = 0; i < width * height; ++i)
|
||||
image[i] = 0.;
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation that also uses
|
||||
// tasks; report the minimum time of three runs.
|
||||
//
|
||||
double minISPCtasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
volume_ispc_tasks(density, n, raster2camera, camera2world,
|
||||
width, height, image);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minISPCtasks = std::min(minISPCtasks, dt);
|
||||
}
|
||||
|
||||
printf("[volume ispc + tasks]:\t\t[%.3f] million cycles\n", minISPCtasks);
|
||||
writePPM(image, width, height, "volume-ispc-tasks.ppm");
|
||||
|
||||
// Clear out the buffer
|
||||
for (int i = 0; i < width * height; ++i)
|
||||
image[i] = 0.;
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
// minimum time.
|
||||
//
|
||||
double minSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
volume_serial(density, n, raster2camera, camera2world,
|
||||
width, height, image);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minSerial = std::min(minSerial, dt);
|
||||
}
|
||||
|
||||
printf("[volume serial]:\t\t[%.3f] millon cycles\n", minSerial);
|
||||
writePPM(image, width, height, "volume-serial.ppm");
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC serial, %.2fx from ISPC+tasks)\n",
|
||||
minSerial/minISPC, minSerial / minISPCtasks);
|
||||
|
||||
return 0;
|
||||
}
|
||||
385
examples/volume_rendering/volume.ispc
Normal file
385
examples/volume_rendering/volume.ispc
Normal file
@@ -0,0 +1,385 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
typedef float<3> float3;
|
||||
|
||||
struct Ray {
|
||||
float3 origin, dir;
|
||||
};
|
||||
|
||||
|
||||
// Builds the world-space ray through raster position (x, y).  The point
// (x, y, 0) is taken to camera space with raster2camera (including the
// divide by the homogeneous w), the result is rotated by the upper 3x3 of
// camera2world to give the direction, and the origin is the translation
// column of camera2world divided by its w entry.
static void
generateRay(const uniform float raster2camera[4][4],
            const uniform float camera2world[4][4],
            float x, float y, reference Ray ray) {
    // raster -> camera space, with the perspective divide folded in
    float w = raster2camera[3][3];
    float cx = (raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]) / w;
    float cy = (raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]) / w;
    float cz = raster2camera[2][3] / w;

    // direction: rotate the camera-space point into world space
    ray.dir.x = camera2world[0][0] * cx + camera2world[0][1] * cy + camera2world[0][2] * cz;
    ray.dir.y = camera2world[1][0] * cx + camera2world[1][1] * cy + camera2world[1][2] * cz;
    ray.dir.z = camera2world[2][0] * cx + camera2world[2][1] * cy + camera2world[2][2] * cz;

    // origin: the camera position in world space
    ray.origin.x = camera2world[0][3] / camera2world[3][3];
    ray.origin.y = camera2world[1][3] / camera2world[3][3];
    ray.origin.z = camera2world[2][3] / camera2world[3][3];
}
|
||||
|
||||
|
||||
// True if p lies within the axis-aligned box [pMin, pMax] (boundaries
// included).
static inline bool
Inside(float3 p, float3 pMin, float3 pMax) {
    bool inX = p.x >= pMin.x && p.x <= pMax.x;
    bool inY = p.y >= pMin.y && p.y <= pMax.y;
    bool inZ = p.z >= pMin.z && p.z <= pMax.z;
    return inX && inY && inZ;
}
|
||||
|
||||
|
||||
// Intersects the ray with the axis-aligned box [pMin, pMax], one pair of
// parallel planes ("slab") per axis.  If the parametric ranges of all
// three slabs overlap, the overlap is returned in hit0/hit1 and the
// result is true; otherwise hit0/hit1 are left untouched.
static bool
IntersectP(Ray ray, float3 pMin, float3 pMax, reference float hit0, reference float hit1) {
    float tEnter = -1e30, tExit = 1e30;

    float3 tLo = (pMin - ray.origin) / ray.dir;
    float3 tHi = (pMax - ray.origin) / ray.dir;

    // x slab
    if (tLo.x > tHi.x) {
        float swapped = tLo.x;
        tLo.x = tHi.x;
        tHi.x = swapped;
    }
    tEnter = max(tLo.x, tEnter);
    tExit = min(tHi.x, tExit);

    // y slab
    if (tLo.y > tHi.y) {
        float swapped = tLo.y;
        tLo.y = tHi.y;
        tHi.y = swapped;
    }
    tEnter = max(tLo.y, tEnter);
    tExit = min(tHi.y, tExit);

    // z slab
    if (tLo.z > tHi.z) {
        float swapped = tLo.z;
        tLo.z = tHi.z;
        tHi.z = swapped;
    }
    tEnter = max(tLo.z, tEnter);
    tExit = min(tHi.z, tExit);

    // Written as a negated "<=" so that a NaN range counts as a miss.
    if (!(tEnter <= tExit))
        return false;

    hit0 = tEnter;
    hit1 = tExit;
    return true;
}
|
||||
|
||||
|
||||
// Linear interpolation: returns a for t == 0 and b for t == 1.
static inline float Lerp(float t, float a, float b) {
    float weightA = 1.f - t;
    return weightA * a + t * b;
}
|
||||
|
||||
|
||||
// Density of voxel (x, y, z) for per-program-instance coordinates.  Each
// coordinate is clamped to the valid range first; x varies fastest in the
// density array.
static inline float D(int x, int y, int z, uniform int nVoxels[3],
                      uniform float density[]) {
    int cx = clamp(x, 0, nVoxels[0]-1);
    int cy = clamp(y, 0, nVoxels[1]-1);
    int cz = clamp(z, 0, nVoxels[2]-1);

    int index = cz*nVoxels[0]*nVoxels[1] + cy*nVoxels[0] + cx;
    return density[index];
}
|
||||
|
||||
|
||||
// Same lookup as D(), but for voxel coordinates that are uniform, i.e.
// shared by all program instances.
static inline float Du(uniform int x, uniform int y, uniform int z,
                       uniform int nVoxels[3], uniform float density[]) {
    uniform int cx = clamp(x, 0, nVoxels[0]-1);
    uniform int cy = clamp(y, 0, nVoxels[1]-1);
    uniform int cz = clamp(z, 0, nVoxels[2]-1);

    uniform int index = cz*nVoxels[0]*nVoxels[1] + cy*nVoxels[0] + cx;
    return density[index];
}
|
||||
|
||||
|
||||
// Position of p relative to the box [pMin, pMax]: pMin maps to 0 and pMax
// to 1 in each dimension.
static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
    float3 fromMin = p - pMin;
    float3 extent = pMax - pMin;
    return fromMin / extent;
}
|
||||
|
||||
|
||||
// Trilinearly interpolated density at point Pobj; 0 outside the box
// [pMin, pMax].  checkForSameVoxel is an in/out flag: while it is set, the
// function tests whether all program instances fall into the same voxel
// and, if so, uses the uniform lookup Du(); the first time that test fails
// the flag is cleared so later calls go straight to the general path.
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
                            uniform float density[], uniform int nVoxels[3],
                            reference uniform bool checkForSameVoxel) {
    if (!Inside(Pobj, pMin, pMax))
        return 0;

    // Continuous voxel-space position of _Pobj_
    float3 grid = Offset(Pobj, pMin, pMax);
    grid.x = grid.x * nVoxels[0] - .5f;
    grid.y = grid.y * nVoxels[1] - .5f;
    grid.z = grid.z * nVoxels[2] - .5f;

    // Integer voxel coordinates ...
    int ix = (int)(grid.x);
    int iy = (int)(grid.y);
    int iz = (int)(grid.z);
    // ... and the offsets from them, used as interpolation weights
    float fx = grid.x - ix;
    float fy = grid.y - iy;
    float fz = grid.z - iz;

    // Interpolate along x on the four voxel edges first
    float e00, e10, e01, e11;
    uniform int ux, uy, uz;
    if (checkForSameVoxel && reduce_equal(ix, ux) && reduce_equal(iy, uy) &&
        reduce_equal(iz, uz)) {
        // Every program instance is in the same voxel, so the uniform
        // variant of the lookup can be used: a single load per value
        // instead of a gather.
        e00 = Lerp(fx, Du(ux,   uy,   uz,   nVoxels, density),
                       Du(ux+1, uy,   uz,   nVoxels, density));
        e10 = Lerp(fx, Du(ux,   uy+1, uz,   nVoxels, density),
                       Du(ux+1, uy+1, uz,   nVoxels, density));
        e01 = Lerp(fx, Du(ux,   uy,   uz+1, nVoxels, density),
                       Du(ux+1, uy,   uz+1, nVoxels, density));
        e11 = Lerp(fx, Du(ux,   uy+1, uz+1, nVoxels, density),
                       Du(ux+1, uy+1, uz+1, nVoxels, density));
    }
    else {
        // General case: gather through D().  Once the instances have
        // diverged into different voxels they are unlikely to line up
        // again, so stop testing on subsequent steps.
        checkForSameVoxel = false;
        e00 = Lerp(fx, D(ix,   iy,   iz,   nVoxels, density),
                       D(ix+1, iy,   iz,   nVoxels, density));
        e10 = Lerp(fx, D(ix,   iy+1, iz,   nVoxels, density),
                       D(ix+1, iy+1, iz,   nVoxels, density));
        e01 = Lerp(fx, D(ix,   iy,   iz+1, nVoxels, density),
                       D(ix+1, iy,   iz+1, nVoxels, density));
        e11 = Lerp(fx, D(ix,   iy+1, iz+1, nVoxels, density),
                       D(ix+1, iy+1, iz+1, nVoxels, density));
    }

    // Then along y, and finally along z
    float nearFace = Lerp(fy, e00, e10);
    float farFace = Lerp(fy, e01, e11);
    return Lerp(fz, nearFace, farFace);
}
|
||||
|
||||
|
||||
/* Transmittance between the points p0 and p1 through the volume occupying
   the box (pMin,pMax), whose density is given by the nVoxels[0] x
   nVoxels[1] x nVoxels[2] values in density[] and whose attenuation
   coefficient is sigma_t.  The segment is marched from p1 towards p0 in
   steps of 0.2 units; the result is 1 if the segment's line misses the
   box. */
static float
transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
              uniform float3 pMax, uniform float sigma_t,
              uniform float density[], uniform int nVoxels[3]) {
    float tStart, tEnd;
    Ray seg;
    seg.origin = p1;
    seg.dir = p0 - p1;

    // Parametric range of the segment that lies inside the volume
    if (!IntersectP(seg, pMin, pMax, tStart, tEnd))
        return 1.;

    tStart = max(tStart, 0.f);

    // tau accumulates the optical thickness along the segment
    float tau = 0;
    float segLength = sqrt(seg.dir.x * seg.dir.x + seg.dir.y * seg.dir.y +
                           seg.dir.z * seg.dir.z);
    uniform float stepDist = 0.2;
    float dt = stepDist / segLength;

    float t = tStart;
    float3 p = seg.origin + seg.dir * tStart;
    float3 dp = seg.dir * dt;
    uniform bool checkForSameVoxel = true;
    while (t < tEnd) {
        tau += stepDist * sigma_t * Density(p, pMin, pMax, density, nVoxels,
                                            checkForSameVoxel);
        p = p + dp;
        t += dt;
    }

    return exp(-tau);
}
|
||||
|
||||
|
||||
// Squared Euclidean distance between a and b.
static inline float
distanceSquared(float3 a, float3 b) {
    float3 delta = a-b;
    return delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
}
|
||||
|
||||
|
||||
// Marches the ray through the density volume and returns the
// gamma-corrected radiance arriving along it.  The volume's box, the light
// position and the scattering parameters are fixed inside the function.
// Rays that miss the box return 0.
static float
raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
    float tStart, tEnd;
    uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
    uniform float3 lightPos = { -1, 4, 1.5 };

    cif (!IntersectP(ray, pMin, pMax, tStart, tEnd))
        return 0.;

    tStart = max(tStart, 0.f);

    // Scattering characteristics of the volume and the sampling rate
    uniform float Le = .25;            // Emission coefficient
    uniform float sigma_a = 10;        // Absorption coefficient
    uniform float sigma_s = 10;        // Scattering coefficient
    uniform float stepDist = 0.025;    // Ray step amount
    uniform float lightIntensity = 40; // Light source intensity

    float tau = 0.f;  // optical thickness so far; exp(-tau) is the attenuation
    float L = 0;      // radiance along the ray
    float dirLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
                           ray.dir.z * ray.dir.z);
    float dt = stepDist / dirLength;

    float t = tStart;
    float3 p = ray.origin + ray.dir * tStart;
    float3 dp = ray.dir * dt;
    uniform bool checkForSameVoxel = true;
    cwhile (t < tEnd) {
        float d = Density(p, pMin, pMax, density, nVoxels, checkForSameVoxel);

        // stop once hardly anything would get through any more
        float atten = exp(-tau);
        if (atten < .005)
            cbreak;

        // direct lighting: inverse-square falloff times the transmittance
        // between the light and the sample point
        float Li = lightIntensity / distanceSquared(lightPos, p) *
            transmittance(lightPos, p, pMin, pMax, sigma_a + sigma_s,
                          density, nVoxels);
        L += stepDist * atten * d * sigma_s * (Li + Le);

        // account for this step's contribution to the optical thickness
        tau += stepDist * (sigma_a + sigma_s) * d;

        p = p + dp;
        t += dt;
    }

    // Gamma correction
    return pow(L, 1.f / 2.2f);
}
|
||||
|
||||
|
||||
/* Utility routine used by both the task-based and the single-core entrypoints.
   Renders a tile of the image, covering [x0,x1) * [y0,y1), storing the
   result into the image[] array.

   The tile is processed in blocks of 4x4 pixels, so (x1-x0) and (y1-y0)
   are implicitly assumed to be evenly divisible by 4.
 */
static void
volume_tile(uniform int x0, uniform int y0, uniform int x1,
            uniform int y1, uniform float density[], uniform int nVoxels[3],
            const uniform float raster2camera[4][4],
            const uniform float camera2world[4][4],
            uniform int width, uniform int height, uniform float image[]) {
    // Mapping from an index in [0,15] to the pixel's offset inside its
    // 4x4 block; the 16 entries visit every pixel of the block once.
    const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
                                       0, 1, 0, 1, 2, 3, 2, 3 };
    const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
                                       2, 2, 3, 3, 2, 2, 3, 3 };

    for (uniform int by = y0; by < y1; by += 4) {
        for (uniform int bx = x0; bx < x1; bx += 4) {
            // Handle programCount pixels of the block at a time until all
            // 16 are done.  This assumes programCount <= 16 and that 16
            // is evenly divisible by programCount.
            for (uniform int first = 0; first < 16; first += programCount) {
                // Pixel handled by this program instance
                int px = bx + xoffsets[first + programIndex];
                int py = by + yoffsets[first + programIndex];

                // Ray through that pixel, from the viewing parameters
                Ray ray;
                generateRay(raster2camera, camera2world, px, py, ray);

                // Raymarch through the volume to get the pixel's value
                int pixel = py * width + px;
                image[pixel] = raymarch(density, nVoxels, ray);
            }
        }
    }
}
|
||||
|
||||
|
||||
/* Task entrypoint: renders the (dx,dy)-sized tile of the image selected by
   taskIndex.  Tiles are numbered in row-major order, with xbuckets tiles
   per row of tiles; the tile is clipped against the image extent before
   being handed to volume_tile().
 */
task void
volume_task(uniform float density[], uniform int nVoxels[3],
            const uniform float raster2camera[4][4],
            const uniform float camera2world[4][4],
            uniform int width, uniform int height, uniform float image[]) {
    uniform int dx = 8, dy = 8;  // must match value in volume_ispc_tasks
    // Number of tiles needed to cover one row of the image
    uniform int xbuckets = (width + (dx-1)) / dx;

    // Column index is taskIndex modulo the tiles per row and row index is
    // taskIndex divided by the tiles per row, so both use xbuckets.
    uniform int x0 = (taskIndex % xbuckets) * dx;
    uniform int y0 = (taskIndex / xbuckets) * dy;
    uniform int x1 = x0 + dx, y1 = y0 + dy;
    x1 = min(x1, width);
    y1 = min(y1, height);

    volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
                camera2world, width, height, image);
}
|
||||
|
||||
|
||||
/* Single-core entrypoint: renders the whole width x height image by
   treating it as one tile that starts at pixel (0,0). */
export void
volume_ispc(uniform float density[], uniform int nVoxels[3],
            const uniform float raster2camera[4][4],
            const uniform float camera2world[4][4],
            uniform int width, uniform int height, uniform float image[]) {
    volume_tile(0, 0, width, height,
                density, nVoxels,
                raster2camera, camera2world,
                width, height, image);
}
|
||||
|
||||
|
||||
/* Task-based entrypoint: launches one volume_task per (dx,dy)-sized tile
   needed to cover the width x height image. */
export void
volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
                  const uniform float raster2camera[4][4],
                  const uniform float camera2world[4][4],
                  uniform int width, uniform int height, uniform float image[]) {
    // Tile size in pixels; volume_task uses the same values
    uniform int dx = 8, dy = 8;
    // Tiles needed horizontally and vertically (rounded up)
    uniform int xTiles = (width + (dx-1)) / dx;
    uniform int yTiles = (height + (dy-1)) / dy;
    uniform int nTasks = xTiles * yTiles;
    launch[nTasks] < volume_task(density, nVoxels, raster2camera, camera2world,
                                 width, height, image) >;
}
|
||||
168
examples/volume_rendering/volume.vcxproj
Normal file
168
examples/volume_rendering/volume.vcxproj
Normal file
@@ -0,0 +1,168 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{dee5733a-e93e-449d-9114-9bffcaeb4df9}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>volume</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="volume.cpp" />
|
||||
<ClCompile Include="volume_serial.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="volume.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
305
examples/volume_rendering/volume_serial.cpp
Normal file
305
examples/volume_rendering/volume_serial.cpp
Normal file
@@ -0,0 +1,305 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <algorithm>
|
||||
|
||||
// Just enough of a float3 class to do what we need in this file.
// The struct holds three floats plus one float of padding and is declared
// with 16-byte alignment (via the compiler-specific attribute below).
#ifdef _MSC_VER
__declspec(align(16))
#endif
struct float3 {
    // Every member, including the padding, is given a defined value so that
    // copying a float3 never reads an indeterminate float.
    float3() : x(0.f), y(0.f), z(0.f), pad(0.f) { }
    float3(float xx, float yy, float zz) : x(xx), y(yy), z(zz), pad(0.f) { }

    // Scale all components by a scalar.
    float3 operator*(float f) const { return float3(x*f, y*f, z*f); }
    // Component-wise arithmetic with another float3.
    float3 operator-(const float3 &f2) const {
        return float3(x-f2.x, y-f2.y, z-f2.z);
    }
    float3 operator*(const float3 &f2) const {
        return float3(x*f2.x, y*f2.y, z*f2.z);
    }
    float3 operator+(const float3 &f2) const {
        return float3(x+f2.x, y+f2.y, z+f2.z);
    }
    float3 operator/(const float3 &f2) const {
        return float3(x/f2.x, y/f2.y, z/f2.z);
    }
    // Indexed access: 0 -> x, 1 -> y, 2 -> z (relies on x, y, z being
    // laid out contiguously).
    float operator[](int i) const { return (&x)[i]; }
    float &operator[](int i) { return (&x)[i]; }

    float x, y, z;
    float pad;  // match padding/alignment of ispc version
}
#ifndef _MSC_VER
__attribute__ ((aligned(16)))
#endif
;
|
||||
|
||||
struct Ray {
|
||||
float3 origin, dir;
|
||||
};
|
||||
|
||||
|
||||
static void
|
||||
generateRay(const float raster2camera[4][4], const float camera2world[4][4],
|
||||
float x, float y, Ray &ray) {
|
||||
// transform raster coordinate (x, y, 0) to camera space
|
||||
float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
|
||||
float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
|
||||
float camz = raster2camera[2][3];
|
||||
float camw = raster2camera[3][3];
|
||||
camx /= camw;
|
||||
camy /= camw;
|
||||
camz /= camw;
|
||||
|
||||
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
|
||||
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
|
||||
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
|
||||
|
||||
ray.origin.x = camera2world[0][3] / camera2world[3][3];
|
||||
ray.origin.y = camera2world[1][3] / camera2world[3][3];
|
||||
ray.origin.z = camera2world[2][3] / camera2world[3][3];
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
Inside(float3 p, float3 pMin, float3 pMax) {
|
||||
return (p.x >= pMin.x && p.x <= pMax.x &&
|
||||
p.y >= pMin.y && p.y <= pMax.y &&
|
||||
p.z >= pMin.z && p.z <= pMax.z);
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
IntersectP(const Ray &ray, float3 pMin, float3 pMax, float *hit0, float *hit1) {
|
||||
float t0 = -1e30f, t1 = 1e30f;
|
||||
|
||||
float3 tNear = (pMin - ray.origin) / ray.dir;
|
||||
float3 tFar = (pMax - ray.origin) / ray.dir;
|
||||
if (tNear.x > tFar.x) {
|
||||
float tmp = tNear.x;
|
||||
tNear.x = tFar.x;
|
||||
tFar.x = tmp;
|
||||
}
|
||||
t0 = std::max(tNear.x, t0);
|
||||
t1 = std::min(tFar.x, t1);
|
||||
|
||||
if (tNear.y > tFar.y) {
|
||||
float tmp = tNear.y;
|
||||
tNear.y = tFar.y;
|
||||
tFar.y = tmp;
|
||||
}
|
||||
t0 = std::max(tNear.y, t0);
|
||||
t1 = std::min(tFar.y, t1);
|
||||
|
||||
if (tNear.z > tFar.z) {
|
||||
float tmp = tNear.z;
|
||||
tNear.z = tFar.z;
|
||||
tFar.z = tmp;
|
||||
}
|
||||
t0 = std::max(tNear.z, t0);
|
||||
t1 = std::min(tFar.z, t1);
|
||||
|
||||
if (t0 <= t1) {
|
||||
*hit0 = t0;
|
||||
*hit1 = t1;
|
||||
return true;
|
||||
}
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Linear interpolation: returns a for t == 0 and b for t == 1.
static inline float Lerp(float t, float a, float b) {
    const float weightA = 1.f - t;
    return weightA * a + t * b;
}
|
||||
|
||||
|
||||
// Limits v to the range [low, high]: first raises it to at least low,
// then caps the result at high.
static inline int Clamp(int v, int low, int high) {
    const int raised = (v < low) ? low : v;
    return (high < raised) ? high : raised;
}
|
||||
|
||||
|
||||
// Fetches the density sample at voxel (x, y, z).  Each coordinate is
// first clamped to [0, nVoxels[axis]-1]; the grid is stored with x
// varying fastest, then y, then z.
static inline float D(int x, int y, int z, int nVoxels[3], float density[]) {
    const int cx = std::min(std::max(x, 0), nVoxels[0]-1);
    const int cy = std::min(std::max(y, 0), nVoxels[1]-1);
    const int cz = std::min(std::max(z, 0), nVoxels[2]-1);

    const int index = cz*nVoxels[0]*nVoxels[1] + cy*nVoxels[0] + cx;
    return density[index];
}
|
||||
|
||||
|
||||
// Expresses p relative to the box [pMin, pMax]: each component is
// (p - pMin) divided by the box extent along that axis.
static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
    const float3 fromMin = p - pMin;
    const float3 extent = pMax - pMin;
    return fromMin / extent;
}
|
||||
|
||||
|
||||
// Returns the density at point Pobj: 0 outside the box [pMin, pMax],
// otherwise the trilinear interpolation of the eight grid samples
// surrounding the point.
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
                            float density[], int nVoxels[3]) {
    if (!Inside(Pobj, pMin, pMax))
        return 0;

    // Compute continuous voxel coordinates for _Pobj_ (shifted by half a
    // voxel), then split them into an integer cell and an offset.
    const float3 rel = Offset(Pobj, pMin, pMax);
    const float fx = rel.x * nVoxels[0] - .5f;
    const float fy = rel.y * nVoxels[1] - .5f;
    const float fz = rel.z * nVoxels[2] - .5f;

    const int ix = (int)(fx);
    const int iy = (int)(fy);
    const int iz = (int)(fz);

    const float wx = fx - ix;
    const float wy = fy - iy;
    const float wz = fz - iz;

    // Interpolate along x on the four cell edges...
    const float nearLow   = Lerp(wx, D(ix, iy, iz, nVoxels, density),
                                 D(ix+1, iy, iz, nVoxels, density));
    const float nearHigh  = Lerp(wx, D(ix, iy+1, iz, nVoxels, density),
                                 D(ix+1, iy+1, iz, nVoxels, density));
    const float farLow    = Lerp(wx, D(ix, iy, iz+1, nVoxels, density),
                                 D(ix+1, iy, iz+1, nVoxels, density));
    const float farHigh   = Lerp(wx, D(ix, iy+1, iz+1, nVoxels, density),
                                 D(ix+1, iy+1, iz+1, nVoxels, density));

    // ...then along y on the two z-faces, and finally along z.
    const float nearFace = Lerp(wy, nearLow, nearHigh);
    const float farFace = Lerp(wy, farLow, farHigh);
    return Lerp(wz, nearFace, farFace);
}
|
||||
|
||||
|
||||
|
||||
static float
|
||||
transmittance(float3 p0, float3 p1, float3 pMin,
|
||||
float3 pMax, float sigma_t, float density[], int nVoxels[3]) {
|
||||
float rayT0, rayT1;
|
||||
Ray ray;
|
||||
ray.origin = p1;
|
||||
ray.dir = p0 - p1;
|
||||
|
||||
// Find the parametric t range along the ray that is inside the volume.
|
||||
if (!IntersectP(ray, pMin, pMax, &rayT0, &rayT1))
|
||||
return 1.;
|
||||
|
||||
rayT0 = std::max(rayT0, 0.f);
|
||||
|
||||
// Accumulate beam transmittance in tau
|
||||
float tau = 0;
|
||||
float rayLength = sqrtf(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
|
||||
ray.dir.z * ray.dir.z);
|
||||
float stepDist = 0.2f;
|
||||
float stepT = stepDist / rayLength;
|
||||
|
||||
float t = rayT0;
|
||||
float3 pos = ray.origin + ray.dir * rayT0;
|
||||
float3 dirStep = ray.dir * stepT;
|
||||
while (t < rayT1) {
|
||||
tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
|
||||
pos = pos + dirStep;
|
||||
t += stepT;
|
||||
}
|
||||
|
||||
return expf(-tau);
|
||||
}
|
||||
|
||||
|
||||
static float
|
||||
distanceSquared(float3 a, float3 b) {
|
||||
float3 d = a-b;
|
||||
return d.x*d.x + d.y*d.y + d.z*d.z;
|
||||
}
|
||||
|
||||
|
||||
static float
|
||||
raymarch(float density[], int nVoxels[3], const Ray &ray) {
|
||||
float rayT0, rayT1;
|
||||
float3 pMin(.3f, -.2f, .3f), pMax(1.8f, 2.3f, 1.8f);
|
||||
float3 lightPos(-1.f, 4.f, 1.5f);
|
||||
|
||||
if (!IntersectP(ray, pMin, pMax, &rayT0, &rayT1))
|
||||
return 0.;
|
||||
|
||||
rayT0 = std::max(rayT0, 0.f);
|
||||
|
||||
// Parameters that define the volume scattering characteristics and
|
||||
// sampling rate for raymarching
|
||||
float Le = .25f; // Emission coefficient
|
||||
float sigma_a = 10; // Absorption coefficient
|
||||
float sigma_s = 10; // Scattering coefficient
|
||||
float stepDist = 0.025f; // Ray step amount
|
||||
float lightIntensity = 40; // Light source intensity
|
||||
|
||||
float tau = 0.f; // accumulated beam transmittance
|
||||
float L = 0; // radiance along the ray
|
||||
float rayLength = sqrtf(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
|
||||
ray.dir.z * ray.dir.z);
|
||||
float stepT = stepDist / rayLength;
|
||||
|
||||
float t = rayT0;
|
||||
float3 pos = ray.origin + ray.dir * rayT0;
|
||||
float3 dirStep = ray.dir * stepT;
|
||||
while (t < rayT1) {
|
||||
float d = Density(pos, pMin, pMax, density, nVoxels);
|
||||
|
||||
// terminate once attenuation is high
|
||||
float atten = expf(-tau);
|
||||
if (atten < .005f)
|
||||
break;
|
||||
|
||||
// direct lighting
|
||||
float Li = lightIntensity / distanceSquared(lightPos, pos) *
|
||||
transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
|
||||
density, nVoxels);
|
||||
L += stepDist * atten * d * sigma_s * (Li + Le);
|
||||
|
||||
// update beam transmittance
|
||||
tau += stepDist * (sigma_a + sigma_s) * d;
|
||||
|
||||
pos = pos + dirStep;
|
||||
t += stepT;
|
||||
}
|
||||
|
||||
// Gamma correction
|
||||
return powf(L, 1.f / 2.2f);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
volume_serial(float density[], int nVoxels[3], const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
int width, int height, float image[]) {
|
||||
int offset = 0;
|
||||
for (int y = 0; y < height; ++y) {
|
||||
for (int x = 0; x < width; ++x, ++offset) {
|
||||
Ray ray;
|
||||
generateRay(raster2camera, camera2world, x, y, ray);
|
||||
image[offset] = raymarch(density, nVoxels, ray);
|
||||
}
|
||||
}
|
||||
}
|
||||
98
expr.h
98
expr.h
@@ -97,7 +97,7 @@ public:
|
||||
that incorporates the given error message string. In either
|
||||
failure case, NULL is returned. */
|
||||
Expr *TypeConv(const Type *type, const char *errorMsgBase = NULL,
|
||||
bool failureOk = false);
|
||||
bool failureOk = false, bool issuePrecisionWarnings = true);
|
||||
};
|
||||
|
||||
|
||||
@@ -121,8 +121,8 @@ public:
|
||||
void Print() const;
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
const Op op;
|
||||
Expr *expr;
|
||||
};
|
||||
@@ -164,8 +164,8 @@ public:
|
||||
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
const Op op;
|
||||
Expr *arg0, *arg1;
|
||||
};
|
||||
@@ -196,8 +196,8 @@ public:
|
||||
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
const Op op;
|
||||
Expr *lvalue, *rvalue;
|
||||
};
|
||||
@@ -217,8 +217,8 @@ public:
|
||||
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
Expr *test, *expr1, *expr2;
|
||||
};
|
||||
|
||||
@@ -240,6 +240,7 @@ public:
|
||||
llvm::Constant *GetConstant(const Type *type) const;
|
||||
ExprList *Optimize();
|
||||
ExprList *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
std::vector<Expr *> exprs;
|
||||
};
|
||||
@@ -249,7 +250,8 @@ public:
|
||||
*/
|
||||
class FunctionCallExpr : public Expr {
|
||||
public:
|
||||
FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, bool isLaunch);
|
||||
FunctionCallExpr(Expr *func, ExprList *args, SourcePos p,
|
||||
bool isLaunch = false, Expr *launchCountExpr = NULL);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
@@ -257,13 +259,15 @@ public:
|
||||
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
Expr *func;
|
||||
ExprList *args;
|
||||
bool isLaunch;
|
||||
Expr *launchCountExpr;
|
||||
|
||||
void resolveFunctionOverloads();
|
||||
private:
|
||||
void resolveFunctionOverloads(bool exactMatchOnly);
|
||||
bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
|
||||
};
|
||||
|
||||
@@ -285,16 +289,21 @@ public:
|
||||
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
Expr *arrayOrVector, *index;
|
||||
};
|
||||
|
||||
|
||||
/** @brief Expression representing member selection ("foo.bar").
|
||||
*
|
||||
* This will also be overloaded to deal with swizzles.
|
||||
*/
|
||||
class MemberExpr : public Expr {
|
||||
public:
|
||||
static MemberExpr* create(Expr *expr, const char *identifier,
|
||||
SourcePos pos, SourcePos identifierPos);
|
||||
|
||||
MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
|
||||
SourcePos identifierPos);
|
||||
|
||||
@@ -305,10 +314,11 @@ public:
|
||||
void Print() const;
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
virtual int getElementNumber() const;
|
||||
|
||||
private:
|
||||
std::string getCandidateNearMatches() const;
|
||||
int getElementNumber() const;
|
||||
|
||||
Expr *expr;
|
||||
std::string identifier;
|
||||
@@ -325,6 +335,24 @@ private:
|
||||
*/
|
||||
class ConstExpr : public Expr {
|
||||
public:
|
||||
/** Create a ConstExpr from a uniform int8 value */
|
||||
ConstExpr(const Type *t, int8_t i, SourcePos p);
|
||||
/** Create a ConstExpr from a varying int8 value */
|
||||
ConstExpr(const Type *t, int8_t *i, SourcePos p);
|
||||
/** Create a ConstExpr from a uniform uint8 value */
|
||||
ConstExpr(const Type *t, uint8_t u, SourcePos p);
|
||||
/** Create a ConstExpr from a varying uint8 value */
|
||||
ConstExpr(const Type *t, uint8_t *u, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr from a uniform int16 value */
|
||||
ConstExpr(const Type *t, int16_t i, SourcePos p);
|
||||
/** Create a ConstExpr from a varying int16 value */
|
||||
ConstExpr(const Type *t, int16_t *i, SourcePos p);
|
||||
/** Create a ConstExpr from a uniform uint16 value */
|
||||
ConstExpr(const Type *t, uint16_t u, SourcePos p);
|
||||
/** Create a ConstExpr from a varying uint16 value */
|
||||
ConstExpr(const Type *t, uint16_t *u, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr from a uniform int32 value */
|
||||
ConstExpr(const Type *t, int32_t i, SourcePos p);
|
||||
/** Create a ConstExpr from a varying int32 value */
|
||||
@@ -333,14 +361,17 @@ public:
|
||||
ConstExpr(const Type *t, uint32_t u, SourcePos p);
|
||||
/** Create a ConstExpr from a varying uint32 value */
|
||||
ConstExpr(const Type *t, uint32_t *u, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr from a uniform float value */
|
||||
ConstExpr(const Type *t, float f, SourcePos p);
|
||||
/** Create a ConstExpr from a varying float value */
|
||||
ConstExpr(const Type *t, float *f, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr from a uniform double value */
|
||||
ConstExpr(const Type *t, double d, SourcePos p);
|
||||
/** Create a ConstExpr from a varying double value */
|
||||
ConstExpr(const Type *t, double *d, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr from a uniform int64 value */
|
||||
ConstExpr(const Type *t, int64_t i, SourcePos p);
|
||||
/** Create a ConstExpr from a varying int64 value */
|
||||
@@ -349,10 +380,12 @@ public:
|
||||
ConstExpr(const Type *t, uint64_t i, SourcePos p);
|
||||
/** Create a ConstExpr from a varying uint64 value */
|
||||
ConstExpr(const Type *t, uint64_t *i, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr from a uniform bool value */
|
||||
ConstExpr(const Type *t, bool b, SourcePos p);
|
||||
/** Create a ConstExpr from a varying bool value */
|
||||
ConstExpr(const Type *t, bool *b, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr of the same type as the given old ConstExpr,
|
||||
with values given by the "values" parameter. */
|
||||
ConstExpr(ConstExpr *old, double *values);
|
||||
@@ -364,6 +397,7 @@ public:
|
||||
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
int EstimateCost() const;
|
||||
|
||||
/** Return the ConstExpr's values as booleans, doing type conversion
|
||||
from the actual type if needed. If forceVarying is true, then type
|
||||
@@ -371,6 +405,30 @@ public:
|
||||
equal to the target vector width into the given pointer. */
|
||||
int AsBool(bool *, bool forceVarying = false) const;
|
||||
|
||||
/** Return the ConstExpr's values as int8s, doing type conversion
|
||||
from the actual type if needed. If forceVarying is true, then type
|
||||
convert to 'varying' so as to always return a number of values
|
||||
equal to the target vector width into the given pointer. */
|
||||
int AsInt8(int8_t *, bool forceVarying = false) const;
|
||||
|
||||
/** Return the ConstExpr's values as uint8s, doing type conversion
|
||||
from the actual type if needed. If forceVarying is true, then type
|
||||
convert to 'varying' so as to always return a number of values
|
||||
equal to the target vector width into the given pointer. */
|
||||
int AsUInt8(uint8_t *, bool forceVarying = false) const;
|
||||
|
||||
/** Return the ConstExpr's values as int16s, doing type conversion
|
||||
from the actual type if needed. If forceVarying is true, then type
|
||||
convert to 'varying' so as to always return a number of values
|
||||
equal to the target vector width into the given pointer. */
|
||||
int AsInt16(int16_t *, bool forceVarying = false) const;
|
||||
|
||||
/** Return the ConstExpr's values as uint16s, doing type conversion
|
||||
from the actual type if needed. If forceVarying is true, then type
|
||||
convert to 'varying' so as to always return a number of values
|
||||
equal to the target vector width into the given pointer. */
|
||||
int AsUInt16(uint16_t *, bool forceVarying = false) const;
|
||||
|
||||
/** Return the ConstExpr's values as int32s, doing type conversion
|
||||
from the actual type if needed. If forceVarying is true, then type
|
||||
convert to 'varying' so as to always return a number of values
|
||||
@@ -417,6 +475,10 @@ private:
|
||||
|
||||
const Type *type;
|
||||
union {
|
||||
int8_t int8Val[ISPC_MAX_NVEC];
|
||||
uint8_t uint8Val[ISPC_MAX_NVEC];
|
||||
int16_t int16Val[ISPC_MAX_NVEC];
|
||||
uint16_t uint16Val[ISPC_MAX_NVEC];
|
||||
int32_t int32Val[ISPC_MAX_NVEC];
|
||||
uint32_t uint32Val[ISPC_MAX_NVEC];
|
||||
bool boolVal[ISPC_MAX_NVEC];
|
||||
@@ -439,8 +501,8 @@ public:
|
||||
void Print() const;
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
const Type *type;
|
||||
Expr *expr;
|
||||
};
|
||||
@@ -458,8 +520,8 @@ public:
|
||||
void Print() const;
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
Expr *expr;
|
||||
};
|
||||
|
||||
@@ -477,8 +539,8 @@ public:
|
||||
void Print() const;
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
Expr *expr;
|
||||
};
|
||||
|
||||
@@ -495,6 +557,7 @@ public:
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
void Print() const;
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
Symbol *symbol;
|
||||
@@ -506,7 +569,7 @@ private:
|
||||
*/
|
||||
class FunctionSymbolExpr : public Expr {
|
||||
public:
|
||||
FunctionSymbolExpr(std::vector<Symbol *> *candidateFunctions,
|
||||
FunctionSymbolExpr(const char *name, std::vector<Symbol *> *candidateFunctions,
|
||||
SourcePos pos);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
@@ -515,10 +578,14 @@ public:
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
void Print() const;
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
friend class FunctionCallExpr;
|
||||
|
||||
/** Name of the function that is being called. */
|
||||
std::string name;
|
||||
|
||||
/** All of the functions with the name given in the function call;
|
||||
there may be more than one, in which case we need to resolve which
|
||||
overload is the best match. */
|
||||
@@ -541,6 +608,7 @@ public:
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
void Print() const;
|
||||
int EstimateCost() const;
|
||||
};
|
||||
|
||||
#endif // ISPC_EXPR_H
|
||||
|
||||
@@ -14,7 +14,7 @@ export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
|
||||
varying int3 vv = array[a];
|
||||
++vv.y;
|
||||
array[a] = vv;
|
||||
print("fin %\n", array[programIndex].y);
|
||||
//CO print("fin %\n", array[programIndex].y);
|
||||
ret[programIndex] = array[programIndex].y;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,19 +1,14 @@
|
||||
static float float4(uniform float a, uniform float b, uniform float c,
|
||||
uniform float d) {
|
||||
float ret = 0;
|
||||
for (uniform int i = 0; i < programCount; i += 4) {
|
||||
ret = insert(ret, i + 0, a);
|
||||
ret = insert(ret, i + 1, b);
|
||||
ret = insert(ret, i + 2, c);
|
||||
ret = insert(ret, i + 3, d);
|
||||
}
|
||||
return ret;
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float r[], uniform float a[]) {
|
||||
unsigned int i = (unsigned int)a[programIndex];
|
||||
r[programIndex] = max((unsigned int)2, i);
|
||||
}
|
||||
|
||||
export float f_f(float a) {
|
||||
unsigned int i = (unsigned int)a;
|
||||
return max((unsigned int)2, i);
|
||||
export void result(uniform float r[]) {
|
||||
r[programIndex] = 1+programIndex;
|
||||
r[0] = 2;
|
||||
}
|
||||
|
||||
export float result() { return float4(2,2,3,4); }
|
||||
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
|
||||
export float f_f(float a) {
|
||||
unsigned int i = (unsigned int)a;
|
||||
return max((unsigned int)10, i);
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float result[], uniform float aa[]) {
|
||||
unsigned int i = (unsigned int)aa[programIndex];
|
||||
result[programIndex] = max((unsigned int)100, i);
|
||||
}
|
||||
|
||||
export float result() { return 10; }
|
||||
export void result(uniform float r[]) { r[programIndex] = 100; }
|
||||
|
||||
|
||||
@@ -1,19 +1,14 @@
|
||||
static float float4(uniform float a, uniform float b, uniform float c,
|
||||
uniform float d) {
|
||||
float ret = 0;
|
||||
for (uniform int i = 0; i < programCount; i += 4) {
|
||||
ret = insert(ret, i + 0, a);
|
||||
ret = insert(ret, i + 1, b);
|
||||
ret = insert(ret, i + 2, c);
|
||||
ret = insert(ret, i + 3, d);
|
||||
}
|
||||
return ret;
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float result[], uniform float aa[]) {
|
||||
unsigned int i = (unsigned int)aa[programIndex];
|
||||
result[programIndex] = min((unsigned int)2, i);
|
||||
}
|
||||
|
||||
export float f_f(float a) {
|
||||
unsigned int i = (unsigned int)a;
|
||||
return min((unsigned int)2, i);
|
||||
export void result(uniform float r[]) {
|
||||
r[programIndex] = 2;
|
||||
r[0] = 1;
|
||||
}
|
||||
|
||||
export float result() { return float4(1,2,2,2); }
|
||||
|
||||
|
||||
@@ -1,19 +1,13 @@
|
||||
static float float4(uniform float a, uniform float b, uniform float c,
|
||||
uniform float d) {
|
||||
float ret = 0;
|
||||
for (uniform int i = 0; i < programCount; i += 4) {
|
||||
ret = insert(ret, i + 0, a);
|
||||
ret = insert(ret, i + 1, b);
|
||||
ret = insert(ret, i + 2, c);
|
||||
ret = insert(ret, i + 3, d);
|
||||
}
|
||||
return ret;
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float r[], uniform float a[]) {
|
||||
unsigned int i = (unsigned int)a[programIndex];
|
||||
r[programIndex] = min((unsigned int)20, i);
|
||||
}
|
||||
|
||||
export float f_f(float a) {
|
||||
unsigned int i = (unsigned int)a;
|
||||
return min((unsigned int)20, i);
|
||||
export void result(uniform float r[]) {
|
||||
r[programIndex] = 1+programIndex;
|
||||
}
|
||||
|
||||
export float result() { return float4(1,2,3,4); }
|
||||
|
||||
|
||||
16
failing_tests/shuffle2-10.ispc
Normal file
16
failing_tests/shuffle2-10.ispc
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
/* failing due to llvm bug http://llvm.org/bugs/show_bug.cgi?id=10421 */
|
||||
|
||||
// Returns programCount.
export uniform int width() { return programCount; }
|
||||
|
||||
// Stores into RET[programIndex] the result of the two-input shuffle() of
// aa (loaded from aFOO[programIndex]) and bb (aa + programCount), using
// the per-instance index 2*programIndex+(int)b-5.
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int8 aa = aFOO[programIndex];
|
||||
int8 bb = aa + programCount;
|
||||
int8 shuf = shuffle(aa, bb, 2*programIndex+(int)b-5);
|
||||
//CO print("%\n%\n%\n%\n", aa, bb, 2*programIndex+(int)b-5, shuf);
|
||||
RET[programIndex] = shuf;
|
||||
}
|
||||
|
||||
// Writes 1 + 2*programIndex to RET[programIndex]; presumably the expected
// output that f_fu's result is compared against -- confirm with the test
// runner.
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 1 + 2*programIndex;
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
|
||||
struct Foo {
|
||||
float f;
|
||||
};
|
||||
|
||||
|
||||
export float foo(Foo f[], int i, uniform int j) {
|
||||
Foo x = f[i];
|
||||
return x.f;
|
||||
}
|
||||
|
||||
209
ispc.cpp
209
ispc.cpp
@@ -42,14 +42,25 @@
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#include <windows.h>
|
||||
#include <direct.h>
|
||||
#define strcasecmp stricmp
|
||||
#endif
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#ifndef LLVM_2_8
|
||||
#include <llvm/Analysis/DIBuilder.h>
|
||||
#endif
|
||||
#include <llvm/Analysis/DebugInfo.h>
|
||||
#include <llvm/Support/Dwarf.h>
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/Target/TargetOptions.h>
|
||||
#include <llvm/Target/TargetData.h>
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
#include <llvm/Support/TargetRegistry.h>
|
||||
#include <llvm/Support/TargetSelect.h>
|
||||
#else
|
||||
#include <llvm/Target/TargetRegistry.h>
|
||||
#include <llvm/Target/TargetSelect.h>
|
||||
#include <llvm/Target/SubtargetFeature.h>
|
||||
#endif
|
||||
#include <llvm/Support/Host.h>
|
||||
|
||||
Globals *g;
|
||||
Module *m;
|
||||
@@ -57,20 +68,198 @@ Module *m;
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Target
|
||||
|
||||
Target::Target() {
|
||||
arch = "x86-64";
|
||||
cpu = "nehalem";
|
||||
isa = SSE4;
|
||||
nativeVectorWidth = 4;
|
||||
vectorWidth = 4;
|
||||
bool
|
||||
Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
bool pic, Target *t) {
|
||||
if (cpu == NULL) {
|
||||
std::string hostCPU = llvm::sys::getHostCPUName();
|
||||
if (hostCPU.size() > 0)
|
||||
cpu = hostCPU.c_str();
|
||||
else {
|
||||
fprintf(stderr, "Warning: unable to determine host CPU!\n");
|
||||
cpu = "generic";
|
||||
}
|
||||
}
|
||||
t->cpu = cpu;
|
||||
|
||||
if (isa == NULL) {
|
||||
if (!strcasecmp(cpu, "atom"))
|
||||
isa = "sse2";
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||
else if (!strcasecmp(cpu, "sandybridge") ||
|
||||
!strcasecmp(cpu, "corei7-avx"))
|
||||
isa = "avx";
|
||||
#endif // LLVM_3_0
|
||||
else
|
||||
isa = "sse4";
|
||||
}
|
||||
if (arch == NULL)
|
||||
arch = "x86-64";
|
||||
|
||||
bool error = false;
|
||||
|
||||
t->generatePIC = pic;
|
||||
|
||||
// Make sure the target architecture is a known one; print an error
|
||||
// with the valid ones otherwise.
|
||||
t->target = NULL;
|
||||
for (llvm::TargetRegistry::iterator iter = llvm::TargetRegistry::begin();
|
||||
iter != llvm::TargetRegistry::end(); ++iter) {
|
||||
if (std::string(arch) == iter->getName()) {
|
||||
t->target = &*iter;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (t->target == NULL) {
|
||||
fprintf(stderr, "Invalid architecture \"%s\"\nOptions: ", arch);
|
||||
llvm::TargetRegistry::iterator iter;
|
||||
for (iter = llvm::TargetRegistry::begin();
|
||||
iter != llvm::TargetRegistry::end(); ++iter)
|
||||
fprintf(stderr, "%s ", iter->getName());
|
||||
fprintf(stderr, "\n");
|
||||
error = true;
|
||||
}
|
||||
else {
|
||||
t->arch = arch;
|
||||
}
|
||||
|
||||
if (!strcasecmp(isa, "sse2")) {
|
||||
t->isa = Target::SSE2;
|
||||
t->nativeVectorWidth = 4;
|
||||
t->vectorWidth = 4;
|
||||
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
|
||||
}
|
||||
else if (!strcasecmp(isa, "sse4")) {
|
||||
t->isa = Target::SSE4;
|
||||
t->nativeVectorWidth = 4;
|
||||
t->vectorWidth = 4;
|
||||
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
||||
}
|
||||
else if (!strcasecmp(isa, "sse4x2")) {
|
||||
t->isa = Target::SSE4;
|
||||
t->nativeVectorWidth = 4;
|
||||
t->vectorWidth = 8;
|
||||
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
||||
}
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
else if (!strcasecmp(isa, "avx")) {
|
||||
t->isa = Target::AVX;
|
||||
t->nativeVectorWidth = 8;
|
||||
t->vectorWidth = 8;
|
||||
t->attributes = "+avx,+popcnt,+cmov";
|
||||
}
|
||||
else if (!strcasecmp(isa, "avx-x2")) {
|
||||
t->isa = Target::AVX;
|
||||
t->nativeVectorWidth = 8;
|
||||
t->vectorWidth = 16;
|
||||
t->attributes = "+avx,+popcnt,+cmov";
|
||||
}
|
||||
#endif // LLVM 3.0
|
||||
else {
|
||||
fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n",
|
||||
isa, SupportedTargetISAs());
|
||||
error = true;
|
||||
}
|
||||
|
||||
if (!error) {
|
||||
llvm::TargetMachine *targetMachine = t->GetTargetMachine();
|
||||
const llvm::TargetData *targetData = targetMachine->getTargetData();
|
||||
t->is32bit = (targetData->getPointerSize() == 4);
|
||||
}
|
||||
|
||||
return !error;
|
||||
}
|
||||
|
||||
|
||||
const char *
|
||||
Target::SupportedTargetCPUs() {
|
||||
return "atom, barcelona, core2, corei7, "
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||
"corei7-avx, "
|
||||
#endif
|
||||
"istanbul, nocona, penryn, "
|
||||
#ifdef LLVM_2_9
|
||||
"sandybridge, "
|
||||
#endif
|
||||
"westmere";
|
||||
}
|
||||
|
||||
|
||||
const char *
|
||||
Target::SupportedTargetArchs() {
|
||||
return "x86, x86-64";
|
||||
}
|
||||
|
||||
|
||||
const char *
|
||||
Target::SupportedTargetISAs() {
|
||||
return "sse2, sse4, sse4x2"
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||
", avx, avx-x2"
|
||||
#endif
|
||||
;
|
||||
}
|
||||
|
||||
|
||||
std::string
|
||||
Target::GetTripleString() const {
|
||||
llvm::Triple triple;
|
||||
// Start with the host triple as the default
|
||||
triple.setTriple(llvm::sys::getHostTriple());
|
||||
|
||||
// And override the arch in the host triple based on what the user
|
||||
// specified. Here we need to deal with the fact that LLVM uses one
|
||||
// naming convention for targets TargetRegistry, but wants some
|
||||
// slightly different ones for the triple. TODO: is there a way to
|
||||
// have it do this remapping, which would presumably be a bit less
|
||||
// error prone?
|
||||
if (arch == "x86")
|
||||
triple.setArchName("i386");
|
||||
else if (arch == "x86-64")
|
||||
triple.setArchName("x86_64");
|
||||
else
|
||||
triple.setArchName(arch);
|
||||
|
||||
return triple.str();
|
||||
}
|
||||
|
||||
|
||||
/** Creates an llvm::TargetMachine for this target's triple, CPU and
    feature attributes, with verbose assembly output enabled by default.
    Each call invokes createTargetMachine() again. */
llvm::TargetMachine *
Target::GetTargetMachine() const {
    const std::string tripleStr = GetTripleString();

    llvm::Reloc::Model reloc = llvm::Reloc::Default;
    if (generatePIC)
        reloc = llvm::Reloc::PIC_;

    llvm::TargetMachine *machine = NULL;
#if defined(LLVM_3_0svn) || defined(LLVM_3_0)
    // This API takes the CPU, the features and the relocation model as
    // separate arguments.
    const std::string features = attributes;
    machine = target->createTargetMachine(tripleStr, cpu, features, reloc);
#else
#ifdef ISPC_IS_APPLE
    reloc = llvm::Reloc::PIC_;
#endif // ISPC_IS_APPLE
    // Older API: the CPU name is the first entry of the feature string and
    // the relocation model is set after construction.
    const std::string features = cpu + std::string(",") + attributes;
    machine = target->createTargetMachine(tripleStr, features);
#ifndef ISPC_IS_WINDOWS
    machine->setRelocationModel(reloc);
#endif // !ISPC_IS_WINDOWS
#endif
    assert(machine != NULL);

    machine->setAsmVerbosityDefault(true);
    return machine;
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Opt
|
||||
|
||||
Opt::Opt() {
|
||||
level = 1;
|
||||
fastMath = false;
|
||||
fastMaskedVload = false;
|
||||
unrollLoops = true;
|
||||
disableBlendedMaskedStores = false;
|
||||
disableCoherentControlFlow = false;
|
||||
disableUniformControlFlow = false;
|
||||
@@ -120,13 +309,9 @@ SourcePos::SourcePos(const char *n, int l, int c) {
|
||||
}
|
||||
|
||||
/** Returns the debug-info file descriptor for this source position.  When
    LLVM_2_8 is defined a default-constructed llvm::DIFile is returned
    instead. */
llvm::DIFile SourcePos::GetDIFile() const {
#ifndef LLVM_2_8
    // Split 'name' into directory and file parts (relative to the current
    // directory) and hand them to the module's DIBuilder.
    std::string dir, file;
    GetDirectoryAndFileName(g->currentDirectory, name, &dir, &file);
    return m->diBuilder->createFile(file, dir);
#else
    return llvm::DIFile();
#endif // !LLVM_2_8
}
|
||||
|
||||
|
||||
|
||||
76
ispc.h
76
ispc.h
@@ -69,6 +69,8 @@ namespace llvm {
|
||||
class FunctionType;
|
||||
class LLVMContext;
|
||||
class Module;
|
||||
class Target;
|
||||
class TargetMachine;
|
||||
class Type;
|
||||
class Value;
|
||||
}
|
||||
@@ -146,6 +148,8 @@ public:
|
||||
pointer in place of the original ASTNode *. */
|
||||
virtual ASTNode *TypeCheck() = 0;
|
||||
|
||||
virtual int EstimateCost() const = 0;
|
||||
|
||||
/** All AST nodes must track the file position where they are
|
||||
defined. */
|
||||
const SourcePos pos;
|
||||
@@ -156,7 +160,34 @@ public:
|
||||
This structure defines a compilation target for the ispc compiler.
|
||||
*/
|
||||
struct Target {
|
||||
Target();
|
||||
/** Initializes the given Target pointer for a target of the given
|
||||
name, if the name is a known target. Returns true if the
|
||||
target was initialized and false if the name is unknown. */
|
||||
static bool GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
bool pic, Target *);
|
||||
|
||||
/** Returns a comma-delimited string giving the names of the currently
|
||||
supported target ISAs. */
|
||||
static const char *SupportedTargetISAs();
|
||||
|
||||
/** Returns a comma-delimited string giving the names of the currently
|
||||
supported target CPUs. */
|
||||
static const char *SupportedTargetCPUs();
|
||||
|
||||
/** Returns a comma-delimited string giving the names of the currently
|
||||
supported target architectures. */
|
||||
static const char *SupportedTargetArchs();
|
||||
|
||||
/** Returns a triple string specifying the target architecture, vendor,
|
||||
and environment. */
|
||||
std::string GetTripleString() const;
|
||||
|
||||
/** Returns the LLVM TargetMachine object corresponding to this
|
||||
target. */
|
||||
llvm::TargetMachine *GetTargetMachine() const;
|
||||
|
||||
/** llvm Target object representing this target. */
|
||||
const llvm::Target *target;
|
||||
|
||||
/** Enumerator giving the instruction sets that the compiler can
|
||||
target. */
|
||||
@@ -168,9 +199,15 @@ struct Target {
|
||||
/** Target system architecture. (e.g. "x86-64", "x86"). */
|
||||
std::string arch;
|
||||
|
||||
/** Is the target architecture 32 or 64 bit */
|
||||
bool is32bit;
|
||||
|
||||
/** Target CPU. (e.g. "corei7", "corei7-avx", ..) */
|
||||
std::string cpu;
|
||||
|
||||
/** Target-specific attributes to pass along to the LLVM backend */
|
||||
std::string attributes;
|
||||
|
||||
/** Native vector width of the vector instruction set. Note that this
|
||||
value is directly derived from the ISA being used (e.g. it's 4 for
|
||||
SSE, 8 for AVX, etc.) */
|
||||
@@ -180,8 +217,12 @@ struct Target {
|
||||
integer multiple of the native vector width, for example if we're
|
||||
"doubling up" and compiling 8-wide on a 4-wide SSE system. */
|
||||
int vectorWidth;
|
||||
|
||||
/** Indicates whether position independent code should be generated. */
|
||||
bool generatePIC;
|
||||
};
|
||||
|
||||
|
||||
/** @brief Structure that collects optimization options
|
||||
|
||||
This structure collects all of the options related to optimization of
|
||||
@@ -199,6 +240,16 @@ struct Opt {
|
||||
should be performed. This is false by default. */
|
||||
bool fastMath;
|
||||
|
||||
/** Indicates whether a vector load should be issued for masked loads
|
||||
on platforms that don't have a native masked vector load. (This may
|
||||
lead to accessing memory up to programCount-1 elements past the end of
|
||||
arrays, so is unsafe in general.) */
|
||||
bool fastMaskedVload;
|
||||
|
||||
/** Indicates when loops should be unrolled (when doing so seems like
|
||||
it will make sense). */
|
||||
bool unrollLoops;
|
||||
|
||||
/** On targets that don't have a masked store instruction but do have a
|
||||
blending instruction, by default, we simulate masked stores by
|
||||
loading the old value, blending, and storing the result. This can
|
||||
@@ -316,6 +367,29 @@ struct Globals {
|
||||
std::vector<std::string> cppArgs;
|
||||
};
|
||||
|
||||
/** Relative cost constants for language constructs, plus two thresholds.
    NOTE(review): presumably these are the units summed by the
    EstimateCost() methods -- confirm against their implementations. */
enum {
    COST_ASSIGN = 1,
    COST_COHERENT_BREAK_CONTINE = 4,
    /* Correctly spelled alias.  The misspelled enumerator above is kept so
       that existing references continue to compile. */
    COST_COHERENT_BREAK_CONTINUE = COST_COHERENT_BREAK_CONTINE,
    COST_COMPLEX_ARITH_OP = 4,
    COST_DEREF = 4,
    COST_FUNCALL = 4,
    COST_GATHER = 8,
    COST_LOAD = 2,
    COST_REGULAR_BREAK_CONTINUE = 2,
    COST_RETURN = 4,
    COST_SELECT = 4,
    COST_SIMPLE_ARITH_LOGIC_OP = 1,
    COST_SYNC = 32,
    COST_TASK_LAUNCH = 16,
    COST_TYPECAST_COMPLEX = 4,
    COST_TYPECAST_SIMPLE = 1,
    COST_UNIFORM_LOOP = 4,
    COST_VARYING_LOOP = 6,

    CHECK_MASK_AT_FUNCTION_START_COST = 16,
    PREDICATE_SAFE_IF_STATEMENT_COST = 6,
};
|
||||
|
||||
extern Globals *g;
|
||||
extern Module *m;
|
||||
|
||||
|
||||
87
ispc.vcxproj
87
ispc.vcxproj
@@ -16,7 +16,9 @@
|
||||
<ClCompile Include="decl.cpp" />
|
||||
<ClCompile Include="expr.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx.cpp" />
|
||||
<ClCompile Include="gen-bitcode-c.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx-x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-c-32.cpp" />
|
||||
<ClCompile Include="gen-bitcode-c-64.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse4.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse4x2.cpp" />
|
||||
@@ -28,13 +30,15 @@
|
||||
<ClCompile Include="main.cpp" />
|
||||
<ClCompile Include="opt.cpp" />
|
||||
<ClCompile Include="parse.cc" />
|
||||
<CustomBuild Include="stdlib-c.c">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c > gen-bitcode-c.cpp</Command>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang stdlib-c.c</Message>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c > gen-bitcode-c.cpp</Command>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang stdlib-c.c</Message>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c.cpp</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c.cpp</Outputs>
|
||||
<!-- Compiles builtins-c.c to LLVM bitcode twice (with -m32 and with -m64)
     and converts each result to C++ source with bitcode2cpp.py. The
     Outputs entries must name exactly the files the commands write. -->
<CustomBuild Include="builtins-c.c">
  <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp;
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp</Command>
  <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang builtins-c.c</Message>
  <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp;
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp</Command>
  <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang builtins-c.c</Message>
  <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp</Outputs>
  <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp</Outputs>
</CustomBuild>
|
||||
<ClCompile Include="stmt.cpp" />
|
||||
<ClCompile Include="sym.cpp" />
|
||||
@@ -59,66 +63,79 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stdlib.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib.cpp</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib.cpp</Outputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stdlib-sse4.ll">
|
||||
<CustomBuild Include="builtins-sse4.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stdlib-sse4x2.ll">
|
||||
<CustomBuild Include="builtins-sse4x2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stdlib-sse2.ll">
|
||||
<CustomBuild Include="builtins-sse2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stdlib-avx.ll">
|
||||
<CustomBuild Include="builtins-avx.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins-avx-x2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="lex.ll">
|
||||
<FileType>Document</FileType>
|
||||
@@ -179,7 +196,7 @@
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
@@ -187,7 +204,7 @@
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
@@ -197,7 +214,7 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
@@ -207,10 +224,10 @@
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
222
ispc_test.cpp
222
ispc_test.cpp
@@ -33,12 +33,25 @@
|
||||
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <memory.h>
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_HAVE_SVML
|
||||
#include <xmmintrin.h>
|
||||
@@ -61,8 +74,14 @@ extern "C" {
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/ExecutionEngine/ExecutionEngine.h>
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
#include <llvm/Support/TargetRegistry.h>
|
||||
#include <llvm/Support/TargetSelect.h>
|
||||
#else
|
||||
#include <llvm/Target/TargetRegistry.h>
|
||||
#include <llvm/Target/TargetSelect.h>
|
||||
#endif
|
||||
#include <llvm/ExecutionEngine/JIT.h>
|
||||
#include <llvm/Target/TargetSelect.h>
|
||||
#include <llvm/Target/TargetOptions.h>
|
||||
#include <llvm/Target/TargetData.h>
|
||||
#include <llvm/Transforms/Scalar.h>
|
||||
@@ -74,42 +93,53 @@ extern "C" {
|
||||
#include <llvm/Support/raw_ostream.h>
|
||||
#include <llvm/Bitcode/ReaderWriter.h>
|
||||
#include <llvm/Support/MemoryBuffer.h>
|
||||
#ifndef LLVM_2_8
|
||||
#include <llvm/Support/system_error.h>
|
||||
#endif
|
||||
|
||||
bool shouldFail = false;
|
||||
|
||||
extern "C" {
|
||||
void ISPCLaunch(void *, void *);
|
||||
void ISPCSync();
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
||||
void ISPCFree(void *ptr);
|
||||
void ISPCLaunch(void **, void *, void *, int32_t);
|
||||
void ISPCSync(void *);
|
||||
void *ISPCAlloc(void **, int64_t size, int32_t alignment);
|
||||
}
|
||||
|
||||
void ISPCLaunch(void *func, void *data) {
|
||||
typedef void (*TaskFuncType)(void *, int, int);
|
||||
void ISPCLaunch(void **handle, void *func, void *data, int32_t count) {
|
||||
*handle = (void *)0xdeadbeef;
|
||||
typedef void (*TaskFuncType)(void *, int, int, int, int);
|
||||
TaskFuncType tft = (TaskFuncType)(func);
|
||||
tft(data, 0, 1);
|
||||
for (int i = 0; i < count; ++i)
|
||||
tft(data, 0, 1, i, count);
|
||||
}
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
void ISPCSync(void *) {
|
||||
}
|
||||
|
||||
|
||||
void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
|
||||
*handle = (void *)0xdeadbeef;
|
||||
// leak time!
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment) {
|
||||
return _aligned_malloc(size, alignment);
|
||||
}
|
||||
|
||||
|
||||
void ISPCFree(void *ptr) {
|
||||
_aligned_free(ptr);
|
||||
}
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
return memalign(alignment, size);
|
||||
#endif
|
||||
#ifdef ISPC_IS_APPLE
|
||||
void *mem = malloc(size + (alignment-1) + sizeof(void*));
|
||||
char *amem = ((char*)mem) + sizeof(void*);
|
||||
amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
|
||||
(alignment - 1)));
|
||||
((void**)amem)[-1] = mem;
|
||||
return amem;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void usage(int ret) {
|
||||
fprintf(stderr, "usage: ispc_test\n");
|
||||
fprintf(stderr, "\t[-h/--help]\tprint help\n");
|
||||
fprintf(stderr, "\t[-f]\t\tindicates that test is expected to fail\n");
|
||||
fprintf(stderr, "\t<files>\n");
|
||||
exit(ret);
|
||||
}
|
||||
@@ -119,20 +149,22 @@ static void svml_missing() {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// On Windows, sin() is an overloaded function, so we need an unambiguous
|
||||
// function we can take the address of when wiring up the external references
|
||||
// below.
|
||||
|
||||
double Sin(double x) { return sin(x); }
|
||||
double Cos(double x) { return cos(x); }
|
||||
double Tan(double x) { return tan(x); }
|
||||
double Atan(double x) { return atan(x); }
|
||||
double Atan2(double y, double x) { return atan2(y, x); }
|
||||
double Pow(double a, double b) { return pow(a, b); }
|
||||
double Exp(double x) { return exp(x); }
|
||||
double Log(double x) { return log(x); }
|
||||
|
||||
static bool lRunTest(const char *fn) {
|
||||
llvm::LLVMContext *ctx = new llvm::LLVMContext;
|
||||
|
||||
#ifdef LLVM_2_8
|
||||
std::string err;
|
||||
llvm::MemoryBuffer *buf = llvm::MemoryBuffer::getFileOrSTDIN(fn, &err);
|
||||
if (!buf) {
|
||||
fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.c_str());
|
||||
delete ctx;
|
||||
return false;
|
||||
}
|
||||
std::string bcErr;
|
||||
llvm::Module *module = llvm::ParseBitcodeFile(buf, *ctx, &bcErr);
|
||||
#else
|
||||
llvm::OwningPtr<llvm::MemoryBuffer> buf;
|
||||
llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf);
|
||||
if (err) {
|
||||
@@ -142,7 +174,6 @@ static bool lRunTest(const char *fn) {
|
||||
}
|
||||
std::string bcErr;
|
||||
llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr);
|
||||
#endif
|
||||
|
||||
if (!module) {
|
||||
fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str());
|
||||
@@ -151,45 +182,58 @@ static bool lRunTest(const char *fn) {
|
||||
}
|
||||
|
||||
std::string eeError;
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::EngineBuilder engineBuilder(module);
|
||||
engineBuilder.setErrorStr(&eeError);
|
||||
engineBuilder.setEngineKind(llvm::EngineKind::JIT);
|
||||
#if 0
|
||||
std::vector<std::string> attributes;
|
||||
if (target != NULL && !strcmp(target, "avx"))
|
||||
attributes.push_back("+avx");
|
||||
engineBuilder.setMAttrs(attributes);
|
||||
engineBuilder.setUseMCJIT(true);
|
||||
#endif
|
||||
llvm::ExecutionEngine *ee = engineBuilder.create();
|
||||
#else
|
||||
llvm::ExecutionEngine *ee = llvm::ExecutionEngine::createJIT(module, &eeError);
|
||||
#endif
|
||||
if (!ee) {
|
||||
fprintf(stderr, "Unable to create ExecutionEngine: %s\n", eeError.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
llvm::Function *func;
|
||||
if ((func = module->getFunction("ISPCLaunch")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)ISPCLaunch);
|
||||
if ((func = module->getFunction("ISPCSync")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)ISPCSync);
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
if ((func = module->getFunction("ISPCMalloc")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)ISPCMalloc);
|
||||
if ((func = module->getFunction("ISPCFree")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)ISPCFree);
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
if ((func = module->getFunction("putchar")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)putchar);
|
||||
if ((func = module->getFunction("printf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)printf);
|
||||
if ((func = module->getFunction("fflush")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)fflush);
|
||||
if ((func = module->getFunction("sinf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)sinf);
|
||||
if ((func = module->getFunction("cosf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)cosf);
|
||||
if ((func = module->getFunction("tanf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)tanf);
|
||||
if ((func = module->getFunction("atanf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)atanf);
|
||||
if ((func = module->getFunction("atan2f")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)atan2f);
|
||||
if ((func = module->getFunction("powf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)powf);
|
||||
if ((func = module->getFunction("expf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)expf);
|
||||
if ((func = module->getFunction("logf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)logf);
|
||||
#define DO_FUNC(FUNC ,FUNCNAME) \
|
||||
if ((func = module->getFunction(FUNCNAME)) != NULL) \
|
||||
ee->addGlobalMapping(func, (void *)FUNC)
|
||||
DO_FUNC(ISPCLaunch, "ISPCLaunch");
|
||||
DO_FUNC(ISPCSync, "ISPCSync");
|
||||
DO_FUNC(ISPCAlloc, "ISPCAlloc");
|
||||
DO_FUNC(putchar, "putchar");
|
||||
DO_FUNC(printf, "printf");
|
||||
DO_FUNC(fflush, "fflush");
|
||||
DO_FUNC(sinf, "sinf");
|
||||
DO_FUNC(cosf, "cosf");
|
||||
DO_FUNC(tanf, "tanf");
|
||||
DO_FUNC(atanf, "atanf");
|
||||
DO_FUNC(atan2f, "atan2f");
|
||||
DO_FUNC(powf, "powf");
|
||||
DO_FUNC(expf, "expf");
|
||||
DO_FUNC(logf, "logf");
|
||||
DO_FUNC(Sin, "sin");
|
||||
DO_FUNC(Cos, "cos");
|
||||
DO_FUNC(Tan, "tan");
|
||||
DO_FUNC(Atan, "atan");
|
||||
DO_FUNC(Atan2, "atan2");
|
||||
DO_FUNC(Pow, "pow");
|
||||
DO_FUNC(Exp, "exp");
|
||||
DO_FUNC(Log, "log");
|
||||
DO_FUNC(memset, "memset");
|
||||
#ifdef ISPC_IS_APPLE
|
||||
DO_FUNC(memset_pattern4, "memset_pattern4");
|
||||
DO_FUNC(memset_pattern8, "memset_pattern8");
|
||||
DO_FUNC(memset_pattern16, "memset_pattern16");
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_HAVE_SVML
|
||||
#define DO_SVML(FUNC ,FUNCNAME) \
|
||||
@@ -231,7 +275,6 @@ static bool lRunTest(const char *fn) {
|
||||
float result[16];
|
||||
for (int i = 0; i < 16; ++i)
|
||||
result[i] = 0;
|
||||
bool ok = true;
|
||||
if (foundResult) {
|
||||
typedef void (*PFN)(float *);
|
||||
PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
|
||||
@@ -288,50 +331,49 @@ static bool lRunTest(const char *fn) {
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "Unable to find runnable function in file \"%s\"\n", fn);
|
||||
ok = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
// see if we got the right result
|
||||
if (ok) {
|
||||
if (foundResult) {
|
||||
for (int i = 0; i < width; ++i)
|
||||
if (returned[i] != result[i]) {
|
||||
ok = false;
|
||||
fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
|
||||
fn, i, returned[i], returned[i], result[i], result[i]);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i = 0; i < width; ++i)
|
||||
fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
|
||||
fn, i, returned[i], returned[i]);
|
||||
}
|
||||
bool resultsMatch = true;
|
||||
if (foundResult) {
|
||||
for (int i = 0; i < width; ++i)
|
||||
if (returned[i] != result[i]) {
|
||||
resultsMatch = false;
|
||||
fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
|
||||
fn, i, returned[i], returned[i], result[i], result[i]);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i = 0; i < width; ++i)
|
||||
fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
|
||||
fn, i, returned[i], returned[i]);
|
||||
}
|
||||
if (foundResult && shouldFail && resultsMatch)
|
||||
fprintf(stderr, "Test %s unexpectedly passed\n", fn);
|
||||
|
||||
delete ee;
|
||||
delete ctx;
|
||||
|
||||
return ok && foundResult;
|
||||
return foundResult && resultsMatch;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
llvm::InitializeNativeTarget();
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
LLVMLinkInJIT();
|
||||
#endif
|
||||
|
||||
std::vector<const char *> files;
|
||||
const char *filename = NULL;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
|
||||
usage(0);
|
||||
if (!strcmp(argv[i], "-f"))
|
||||
shouldFail = true;
|
||||
else
|
||||
files.push_back(argv[i]);
|
||||
filename = argv[i];
|
||||
}
|
||||
|
||||
int passes = 0, fails = 0;
|
||||
for (unsigned int i = 0; i < files.size(); ++i) {
|
||||
if (lRunTest(files[i])) ++passes;
|
||||
else ++fails;
|
||||
}
|
||||
|
||||
if (fails > 0)
|
||||
fprintf(stderr, "%d/%d tests passed\n", passes, passes+fails);
|
||||
return fails > 0;
|
||||
return (lRunTest(filename) == true) ? 0 : 1;
|
||||
}
|
||||
|
||||
@@ -52,14 +52,14 @@
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>ISPC_IS_WINDOWS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>LLVM_3_0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
@@ -70,7 +70,7 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>ISPC_IS_WINDOWS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>LLVM_3_0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -79,10 +79,10 @@
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
3
lex.ll
3
lex.ll
@@ -72,6 +72,7 @@ FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?
|
||||
HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)
|
||||
|
||||
IDENT [a-zA-Z_][a-zA-Z_0-9]*
|
||||
ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+
|
||||
|
||||
%%
|
||||
"/*" { lCComment(yylloc); }
|
||||
@@ -104,6 +105,8 @@ goto { return TOKEN_GOTO; }
|
||||
if { return TOKEN_IF; }
|
||||
inline { return TOKEN_INLINE; }
|
||||
int { return TOKEN_INT; }
|
||||
int8 { return TOKEN_INT8; }
|
||||
int16 { return TOKEN_INT16; }
|
||||
int32 { return TOKEN_INT; }
|
||||
int64 { return TOKEN_INT64; }
|
||||
launch { return TOKEN_LAUNCH; }
|
||||
|
||||
158
llvmutil.cpp
158
llvmutil.cpp
@@ -41,28 +41,39 @@
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
|
||||
LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::BoolType = NULL;
|
||||
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8Type = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16Type = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32Type = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32PointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64Type = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64PointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleType = NULL;
|
||||
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8PointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16PointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32PointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64PointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoublePointerType = NULL;
|
||||
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::MaskType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
|
||||
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int1VectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int8VectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int16VectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int32VectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int64VectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
|
||||
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8VectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16VectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
|
||||
|
||||
LLVM_TYPE_CONST llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
|
||||
|
||||
llvm::Constant *LLVMTrue = NULL;
|
||||
@@ -75,16 +86,20 @@ void
|
||||
InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
|
||||
LLVMTypes::VoidType = llvm::Type::getVoidTy(*ctx);
|
||||
LLVMTypes::VoidPointerType = llvm::PointerType::get(llvm::Type::getInt8Ty(*ctx), 0);
|
||||
|
||||
LLVMTypes::BoolType = llvm::Type::getInt1Ty(*ctx);
|
||||
LLVMTypes::Int8Type = llvm::Type::getInt8Ty(*ctx);
|
||||
LLVMTypes::Int16Type = llvm::Type::getInt16Ty(*ctx);
|
||||
LLVMTypes::Int32Type = llvm::Type::getInt32Ty(*ctx);
|
||||
LLVMTypes::Int32PointerType = llvm::PointerType::get(LLVMTypes::Int32Type, 0);
|
||||
LLVMTypes::Int64Type = llvm::Type::getInt64Ty(*ctx);
|
||||
LLVMTypes::Int64PointerType = llvm::PointerType::get(LLVMTypes::Int64Type, 0);
|
||||
LLVMTypes::FloatType = llvm::Type::getFloatTy(*ctx);
|
||||
LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
|
||||
LLVMTypes::DoubleType = llvm::Type::getDoubleTy(*ctx);
|
||||
|
||||
LLVMTypes::Int8PointerType = llvm::PointerType::get(LLVMTypes::Int8Type, 0);
|
||||
LLVMTypes::Int16PointerType = llvm::PointerType::get(LLVMTypes::Int16Type, 0);
|
||||
LLVMTypes::Int32PointerType = llvm::PointerType::get(LLVMTypes::Int32Type, 0);
|
||||
LLVMTypes::Int64PointerType = llvm::PointerType::get(LLVMTypes::Int64Type, 0);
|
||||
LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
|
||||
LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0);
|
||||
|
||||
// Note that both the mask and bool vectors are vector of int32s
|
||||
@@ -95,18 +110,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
|
||||
|
||||
LLVMTypes::Int1VectorType =
|
||||
llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth);
|
||||
LLVMTypes::Int8VectorType =
|
||||
llvm::VectorType::get(LLVMTypes::Int8Type, target.vectorWidth);
|
||||
LLVMTypes::Int16VectorType =
|
||||
llvm::VectorType::get(LLVMTypes::Int16Type, target.vectorWidth);
|
||||
LLVMTypes::Int32VectorType =
|
||||
llvm::VectorType::get(LLVMTypes::Int32Type, target.vectorWidth);
|
||||
LLVMTypes::Int32VectorPointerType = llvm::PointerType::get(LLVMTypes::Int32VectorType, 0);
|
||||
LLVMTypes::Int64VectorType =
|
||||
llvm::VectorType::get(LLVMTypes::Int64Type, target.vectorWidth);
|
||||
LLVMTypes::Int64VectorPointerType = llvm::PointerType::get(LLVMTypes::Int64VectorType, 0);
|
||||
LLVMTypes::FloatVectorType =
|
||||
llvm::VectorType::get(LLVMTypes::FloatType, target.vectorWidth);
|
||||
LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
|
||||
LLVMTypes::DoubleVectorType =
|
||||
llvm::VectorType::get(LLVMTypes::DoubleType, target.vectorWidth);
|
||||
|
||||
LLVMTypes::Int8VectorPointerType = llvm::PointerType::get(LLVMTypes::Int8VectorType, 0);
|
||||
LLVMTypes::Int16VectorPointerType = llvm::PointerType::get(LLVMTypes::Int16VectorType, 0);
|
||||
LLVMTypes::Int32VectorPointerType = llvm::PointerType::get(LLVMTypes::Int32VectorType, 0);
|
||||
LLVMTypes::Int64VectorPointerType = llvm::PointerType::get(LLVMTypes::Int64VectorType, 0);
|
||||
LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
|
||||
LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);
|
||||
|
||||
LLVMTypes::VoidPointerVectorType =
|
||||
llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth);
|
||||
|
||||
@@ -133,7 +156,36 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
|
||||
}
|
||||
|
||||
|
||||
llvm::ConstantInt *LLVMInt32(int32_t ival) {
|
||||
llvm::ConstantInt *
|
||||
LLVMInt8(int8_t ival) {
|
||||
return llvm::ConstantInt::get(llvm::Type::getInt8Ty(*g->ctx), ival,
|
||||
true /*signed*/);
|
||||
}
|
||||
|
||||
|
||||
llvm::ConstantInt *
|
||||
LLVMUInt8(uint8_t ival) {
|
||||
return llvm::ConstantInt::get(llvm::Type::getInt8Ty(*g->ctx), ival,
|
||||
false /*unsigned*/);
|
||||
}
|
||||
|
||||
|
||||
llvm::ConstantInt *
|
||||
LLVMInt16(int16_t ival) {
|
||||
return llvm::ConstantInt::get(llvm::Type::getInt16Ty(*g->ctx), ival,
|
||||
true /*signed*/);
|
||||
}
|
||||
|
||||
|
||||
llvm::ConstantInt *
|
||||
LLVMUInt16(uint16_t ival) {
|
||||
return llvm::ConstantInt::get(llvm::Type::getInt16Ty(*g->ctx), ival,
|
||||
false /*unsigned*/);
|
||||
}
|
||||
|
||||
|
||||
llvm::ConstantInt *
|
||||
LLVMInt32(int32_t ival) {
|
||||
return llvm::ConstantInt::get(llvm::Type::getInt32Ty(*g->ctx), ival,
|
||||
true /*signed*/);
|
||||
}
|
||||
@@ -172,6 +224,82 @@ LLVMDouble(double dval) {
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMInt8Vector(int8_t ival) {
|
||||
llvm::Constant *v = LLVMInt8(ival);
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(v);
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMInt8Vector(const int8_t *ivec) {
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(LLVMInt8(ivec[i]));
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMUInt8Vector(uint8_t ival) {
|
||||
llvm::Constant *v = LLVMUInt8(ival);
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(v);
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMUInt8Vector(const uint8_t *ivec) {
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(LLVMUInt8(ivec[i]));
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMInt16Vector(int16_t ival) {
|
||||
llvm::Constant *v = LLVMInt16(ival);
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(v);
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMInt16Vector(const int16_t *ivec) {
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(LLVMInt16(ivec[i]));
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMUInt16Vector(uint16_t ival) {
|
||||
llvm::Constant *v = LLVMUInt16(ival);
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(v);
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMUInt16Vector(const uint16_t *ivec) {
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(LLVMUInt16(ivec[i]));
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMInt32Vector(int32_t ival) {
|
||||
llvm::Constant *v = LLVMInt32(ival);
|
||||
@@ -325,8 +453,8 @@ LLVMBoolVector(const bool *bvec) {
|
||||
}
|
||||
|
||||
|
||||
const llvm::ArrayType *
|
||||
LLVMPointerVectorType(const llvm::Type *t) {
|
||||
LLVM_TYPE_CONST llvm::ArrayType *
|
||||
LLVMPointerVectorType(LLVM_TYPE_CONST llvm::Type *t) {
|
||||
// NOTE: ArrayType, not VectorType
|
||||
return llvm::ArrayType::get(llvm::PointerType::get(t, 0),
|
||||
g->target.vectorWidth);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user