Compare commits
100 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2f35bc1a0f | ||
|
|
1620e0508d | ||
|
|
cb7976bbf6 | ||
|
|
5ee4d7fce8 | ||
|
|
8f3e46f67e | ||
|
|
9ed07ff2b5 | ||
|
|
32a0a30cf5 | ||
|
|
6d39d5fc3e | ||
|
|
c999c8a237 | ||
|
|
aad269fdf4 | ||
|
|
d45c536c47 | ||
|
|
f1b8e5b1bf | ||
|
|
e7a70b05af | ||
|
|
cf73286938 | ||
|
|
e6f80c0adc | ||
|
|
5e31d7b6d0 | ||
|
|
649f2ad7b7 | ||
|
|
fade1cdf1d | ||
|
|
d261105a86 | ||
|
|
b3d3e8987b | ||
|
|
4e91f3777a | ||
|
|
5584240c7f | ||
|
|
7126a39092 | ||
|
|
8ad28a3f6f | ||
|
|
9921b8e530 | ||
|
|
9052d4b10b | ||
|
|
2405dae8e6 | ||
|
|
3607f3e045 | ||
|
|
de84acfa5d | ||
|
|
a501ab1aa6 | ||
|
|
cdc850f98c | ||
|
|
ca87579f23 | ||
|
|
38fc13d1ab | ||
|
|
cf9d9f717e | ||
|
|
173632f446 | ||
|
|
1dedd88132 | ||
|
|
0848c2cc19 | ||
|
|
e2a88d491f | ||
|
|
30f9dcd4f5 | ||
|
|
0c344b6755 | ||
|
|
6734021520 | ||
|
|
dd153d3c5c | ||
|
|
9ca7541d52 | ||
|
|
0c20483853 | ||
|
|
9d4ff1bc06 | ||
|
|
83f22f1939 | ||
|
|
6375ed9224 | ||
|
|
cf23cf9ef4 | ||
|
|
1147b53dcd | ||
|
|
4cf831a651 | ||
|
|
785d8a29d3 | ||
|
|
46d2bad231 | ||
|
|
32da8e11b4 | ||
|
|
5dedb6f836 | ||
|
|
2ea6d249d5 | ||
|
|
c86128e8ee | ||
|
|
375f1cb8e8 | ||
|
|
3ca7b6b078 | ||
|
|
effe901890 | ||
|
|
4f451bd041 | ||
|
|
c76ef7b174 | ||
|
|
743d82e935 | ||
|
|
18546e9c6d | ||
|
|
f24ab16b91 | ||
|
|
766b34683c | ||
|
|
b5bfa43e92 | ||
|
|
99221f7d17 | ||
|
|
eb7913f1dd | ||
|
|
08cad7a665 | ||
|
|
9cd92facbd | ||
|
|
85063f493c | ||
|
|
f65a20c700 | ||
|
|
e144724979 | ||
|
|
96a297c747 | ||
|
|
67e00b97c6 | ||
|
|
a94cabc692 | ||
|
|
ad9e66650d | ||
|
|
6de494cfdb | ||
|
|
58e34ba4ae | ||
|
|
33feeffe5d | ||
|
|
d0db46aac5 | ||
|
|
da76396c75 | ||
|
|
bbf3fb6307 | ||
|
|
4ab982bc16 | ||
|
|
34301e09f5 | ||
|
|
84e586e767 | ||
|
|
72a2f5d2f4 | ||
|
|
606cbab0d4 | ||
|
|
54ec56c81d | ||
|
|
a322398c62 | ||
|
|
f22b3a25bd | ||
|
|
b67498766e | ||
|
|
c340ff3893 | ||
|
|
b0f59777d4 | ||
|
|
e14208f489 | ||
|
|
7756265503 | ||
|
|
f841b775c3 | ||
|
|
8c921544a0 | ||
|
|
fe54f1ad8e | ||
|
|
74c2c8ae07 |
16
Makefile
16
Makefile
@@ -10,7 +10,12 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
|
|||||||
-lclangSerialization -lclangParse -lclangSema \
|
-lclangSerialization -lclangParse -lclangSema \
|
||||||
-lclangAnalysis -lclangAST -lclangLex -lclangBasic
|
-lclangAnalysis -lclangAST -lclangLex -lclangBasic
|
||||||
|
|
||||||
LLVM_LIBS=$(shell llvm-config --ldflags --libs) -lpthread -ldl
|
ISPC_LIBS=$(CLANG_LIBS) \
|
||||||
|
$(shell llvm-config --ldflags --libs) \
|
||||||
|
-lpthread -ldl
|
||||||
|
ISPC_TEST_LIBS=$(shell llvm-config --ldflags --libs) \
|
||||||
|
-lpthread -ldl
|
||||||
|
|
||||||
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
|
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
|
||||||
LLVM_VERSION=$(shell llvm-config --version | sed s/\\./_/)
|
LLVM_VERSION=$(shell llvm-config --version | sed s/\\./_/)
|
||||||
LLVM_VERSION_DEF=-DLLVM_$(LLVM_VERSION)
|
LLVM_VERSION_DEF=-DLLVM_$(LLVM_VERSION)
|
||||||
@@ -44,7 +49,8 @@ CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
|
|||||||
util.cpp
|
util.cpp
|
||||||
HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
|
HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
|
||||||
opt.h stmt.h sym.h type.h util.h
|
opt.h stmt.h sym.h type.h util.h
|
||||||
BUILTINS_SRC=builtins-avx.ll builtins-sse2.ll builtins-sse4.ll builtins-sse4x2.ll
|
BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
|
||||||
|
builtins-sse4.ll builtins-sse4x2.ll
|
||||||
BISON_SRC=parse.yy
|
BISON_SRC=parse.yy
|
||||||
FLEX_SRC=lex.ll
|
FLEX_SRC=lex.ll
|
||||||
|
|
||||||
@@ -79,11 +85,11 @@ doxygen:
|
|||||||
|
|
||||||
ispc: print_llvm_src dirs $(OBJS)
|
ispc: print_llvm_src dirs $(OBJS)
|
||||||
@echo Creating ispc executable
|
@echo Creating ispc executable
|
||||||
@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(CLANG_LIBS) $(LLVM_LIBS)
|
@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
|
||||||
|
|
||||||
ispc_test: dirs ispc_test.cpp
|
ispc_test: dirs ispc_test.cpp
|
||||||
@echo Creating ispc_test executable
|
@echo Creating ispc_test executable
|
||||||
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(LLVM_LIBS)
|
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(ISPC_TEST_LIBS)
|
||||||
|
|
||||||
objs/%.o: %.cpp
|
objs/%.o: %.cpp
|
||||||
@echo Compiling $<
|
@echo Compiling $<
|
||||||
@@ -105,7 +111,7 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
|
|||||||
@echo Compiling $<
|
@echo Compiling $<
|
||||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||||
|
|
||||||
objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll
|
objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll
|
||||||
@echo Creating C++ source from builtin definitions file $<
|
@echo Creating C++ source from builtin definitions file $<
|
||||||
@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
|
@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
|
||||||
|
|
||||||
|
|||||||
@@ -15,8 +15,8 @@ code.
|
|||||||
|
|
||||||
ispc is an open source compiler under the BSD license; see the file
|
ispc is an open source compiler under the BSD license; see the file
|
||||||
LICENSE.txt. ispc supports Windows, Mac, and Linux, with both x86 and
|
LICENSE.txt. ispc supports Windows, Mac, and Linux, with both x86 and
|
||||||
x86-64 targets. It currently supports the SSE2 and SSE4 instruction sets,
|
x86-64 targets. It currently supports the SSE2, SSE4, and AVX instruction
|
||||||
though support for AVX should be available soon.
|
sets.
|
||||||
|
|
||||||
For more information and examples, as well as a wiki and the bug database,
|
For more information and examples, as well as a wiki and the bug database,
|
||||||
see the ispc distribution site, http://ispc.github.com.
|
see the ispc distribution site, http://ispc.github.com.
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ import sys
|
|||||||
import string
|
import string
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import platform
|
||||||
|
import os
|
||||||
|
|
||||||
length=0
|
length=0
|
||||||
|
|
||||||
@@ -14,8 +16,12 @@ target = re.sub("\.ll$", "", target)
|
|||||||
target = re.sub("\.c$", "", target)
|
target = re.sub("\.c$", "", target)
|
||||||
target = re.sub("-", "_", target)
|
target = re.sub("-", "_", target)
|
||||||
|
|
||||||
|
llvm_as="llvm-as"
|
||||||
|
if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT") != -1:
|
||||||
|
llvm_as = os.getenv("LLVM_INSTALL_DIR").replace("\\", "/") + "/bin/" + llvm_as
|
||||||
|
|
||||||
try:
|
try:
|
||||||
as_out=subprocess.Popen([ "llvm-as", "-", "-o", "-"], stdout=subprocess.PIPE)
|
as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
|
||||||
except IOError:
|
except IOError:
|
||||||
print >> sys.stderr, "Couldn't open " + src
|
print >> sys.stderr, "Couldn't open " + src
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|||||||
278
builtins-avx-common.ll
Normal file
278
builtins-avx-common.ll
Normal file
@@ -0,0 +1,278 @@
|
|||||||
|
;; Copyright (c) 2010-2011, Intel Corporation
|
||||||
|
;; All rights reserved.
|
||||||
|
;;
|
||||||
|
;; Redistribution and use in source and binary forms, with or without
|
||||||
|
;; modification, are permitted provided that the following conditions are
|
||||||
|
;; met:
|
||||||
|
;;
|
||||||
|
;; * Redistributions of source code must retain the above copyright
|
||||||
|
;; notice, this list of conditions and the following disclaimer.
|
||||||
|
;;
|
||||||
|
;; * Redistributions in binary form must reproduce the above copyright
|
||||||
|
;; notice, this list of conditions and the following disclaimer in the
|
||||||
|
;; documentation and/or other materials provided with the distribution.
|
||||||
|
;;
|
||||||
|
;; * Neither the name of Intel Corporation nor the names of its
|
||||||
|
;; contributors may be used to endorse or promote products derived from
|
||||||
|
;; this software without specific prior written permission.
|
||||||
|
;;
|
||||||
|
;;
|
||||||
|
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; *** Untested *** AVX target implementation.
|
||||||
|
;;
|
||||||
|
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||||
|
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||||
|
;; chance that there are bugs in the code in this file.
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; rcp
|
||||||
|
|
||||||
|
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
|
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; uniform float iv = extract(__rcp_u(v), 0);
|
||||||
|
; return iv * (2. - v * iv);
|
||||||
|
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
||||||
|
%scall = extractelement <4 x float> %call, i32 0
|
||||||
|
|
||||||
|
; do one N-R iteration
|
||||||
|
%v_iv = fmul float %0, %scall
|
||||||
|
%two_minus = fsub float 2., %v_iv
|
||||||
|
%iv_mul = fmul float %scall, %two_minus
|
||||||
|
ret float %iv_mul
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; rounding floats
|
||||||
|
|
||||||
|
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||||
|
|
||||||
|
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||||
|
; the roundss intrinsic is a total mess--docs say:
|
||||||
|
;
|
||||||
|
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
||||||
|
;
|
||||||
|
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
||||||
|
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
||||||
|
; return value is described by the following equations:
|
||||||
|
;
|
||||||
|
; r0 = RND(b0)
|
||||||
|
; r1 = a1
|
||||||
|
; r2 = a2
|
||||||
|
; r3 = a3
|
||||||
|
;
|
||||||
|
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||||
|
; here. So we pass the same register for both.
|
||||||
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||||
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
|
ret float %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; see above for round_ss intrinsic discussion...
|
||||||
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||||
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
|
ret float %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; see above for round_ss intrinsic discussion...
|
||||||
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||||
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
|
ret float %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; rounding doubles
|
||||||
|
|
||||||
|
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||||
|
|
||||||
|
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||||
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
|
ret double %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
|
; see above for round_ss intrinsic discussion...
|
||||||
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
|
; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||||
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
|
ret double %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
|
; see above for round_ss intrinsic discussion...
|
||||||
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
|
; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||||
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
|
ret double %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; rsqrt
|
||||||
|
|
||||||
|
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
|
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||||
|
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||||
|
%is = extractelement <4 x float> %vis, i32 0
|
||||||
|
|
||||||
|
; return 0.5 * is * (3. - (v * is) * is);
|
||||||
|
%v_is = fmul float %0, %is
|
||||||
|
%v_is_is = fmul float %v_is, %is
|
||||||
|
%three_sub = fsub float 3., %v_is_is
|
||||||
|
%is_mul = fmul float %is, %three_sub
|
||||||
|
%half_scale = fmul float 0.5, %is_mul
|
||||||
|
ret float %half_scale
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; sqrt
|
||||||
|
|
||||||
|
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
|
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||||
|
ret float %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; fastmath
|
||||||
|
|
||||||
|
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||||
|
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||||
|
|
||||||
|
define internal void @__fastmath() nounwind alwaysinline {
|
||||||
|
%ptr = alloca i32
|
||||||
|
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||||
|
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||||
|
%oldval = load i32 *%ptr
|
||||||
|
|
||||||
|
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||||
|
%update = or i32 %oldval, 32832
|
||||||
|
store i32 %update, i32 *%ptr
|
||||||
|
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; float min/max
|
||||||
|
|
||||||
|
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
|
||||||
|
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||||
|
ret float %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||||
|
ret float %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; int min/max
|
||||||
|
|
||||||
|
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
|
||||||
|
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||||
|
ret i32 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||||
|
ret i32 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; unsigned int min/max
|
||||||
|
|
||||||
|
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
|
||||||
|
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||||
|
ret i32 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||||
|
ret i32 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
; horizontal ops
|
||||||
|
|
||||||
|
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||||
|
|
||||||
|
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||||
|
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||||
|
ret i32 %call
|
||||||
|
}
|
||||||
|
|
||||||
|
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||||
|
|
||||||
|
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||||
|
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||||
|
ret i64 %call
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; double precision sqrt
|
||||||
|
|
||||||
|
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
|
||||||
|
|
||||||
|
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||||
|
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
|
||||||
|
ret double %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; double precision min/max
|
||||||
|
|
||||||
|
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||||
|
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||||
|
|
||||||
|
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
||||||
|
ret double %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
||||||
|
ret double %ret
|
||||||
|
}
|
||||||
665
builtins-avx-x2.ll
Normal file
665
builtins-avx-x2.ll
Normal file
@@ -0,0 +1,665 @@
|
|||||||
|
;; Copyright (c) 2010-2011, Intel Corporation
|
||||||
|
;; All rights reserved.
|
||||||
|
;;
|
||||||
|
;; Redistribution and use in source and binary forms, with or without
|
||||||
|
;; modification, are permitted provided that the following conditions are
|
||||||
|
;; met:
|
||||||
|
;;
|
||||||
|
;; * Redistributions of source code must retain the above copyright
|
||||||
|
;; notice, this list of conditions and the following disclaimer.
|
||||||
|
;;
|
||||||
|
;; * Redistributions in binary form must reproduce the above copyright
|
||||||
|
;; notice, this list of conditions and the following disclaimer in the
|
||||||
|
;; documentation and/or other materials provided with the distribution.
|
||||||
|
;;
|
||||||
|
;; * Neither the name of Intel Corporation nor the names of its
|
||||||
|
;; contributors may be used to endorse or promote products derived from
|
||||||
|
;; this software without specific prior written permission.
|
||||||
|
;;
|
||||||
|
;;
|
||||||
|
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; *** Untested *** AVX target implementation.
|
||||||
|
;;
|
||||||
|
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||||
|
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||||
|
;; chance that there are bugs in the code in this file.
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; Basic 16-wide definitions
|
||||||
|
|
||||||
|
stdlib_core(16)
|
||||||
|
packed_load_and_store(16)
|
||||||
|
scans(16)
|
||||||
|
int64minmax(16)
|
||||||
|
|
||||||
|
include(`builtins-avx-common.ll')
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; rcp
|
||||||
|
|
||||||
|
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
||||||
|
|
||||||
|
define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||||
|
; float iv = __rcp_v(v);
|
||||||
|
; return iv * (2. - v * iv);
|
||||||
|
|
||||||
|
unary8to16(call, float, @llvm.x86.avx.rcp.ps.256, %0)
|
||||||
|
; do one N-R iteration
|
||||||
|
%v_iv = fmul <16 x float> %0, %call
|
||||||
|
%two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
|
||||||
|
float 2., float 2., float 2., float 2.,
|
||||||
|
float 2., float 2., float 2., float 2.,
|
||||||
|
float 2., float 2., float 2., float 2.>, %v_iv
|
||||||
|
%iv_mul = fmul <16 x float> %call, %two_minus
|
||||||
|
ret <16 x float> %iv_mul
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; rounding floats
|
||||||
|
|
||||||
|
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
||||||
|
|
||||||
|
define internal <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||||
|
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||||
|
round8to16(%0, 8)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||||
|
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
|
round8to16(%0, 9)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||||
|
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
|
round8to16(%0, 10)
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; rounding doubles
|
||||||
|
|
||||||
|
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
||||||
|
|
||||||
|
define internal <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||||
|
round4to16double(%0, 8)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||||
|
round4to16double(%0, 9)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||||
|
round4to16double(%0, 10)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; rsqrt
|
||||||
|
|
||||||
|
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
||||||
|
|
||||||
|
; Reciprocal square root of each lane of the 16-wide float vector %v.
; Starts from the AVX rsqrt intrinsic's estimate and applies one
; refinement step: 0.5 * is * (3 - v * is * is).
define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
|
||||||
|
; float is = __rsqrt_v(v);
|
||||||
|
; the macro (defined outside this chunk) must define %is, used below
|
||||||
|
unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
|
||||||
|
; return 0.5 * is * (3. - (v * is) * is);
|
||||||
|
%v_is = fmul <16 x float> %v, %is
|
||||||
|
%v_is_is = fmul <16 x float> %v_is, %is
|
||||||
|
%three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
|
||||||
|
float 3., float 3., float 3., float 3.,
|
||||||
|
float 3., float 3., float 3., float 3.,
|
||||||
|
float 3., float 3., float 3., float 3.>, %v_is_is
|
||||||
|
%is_mul = fmul <16 x float> %is, %three_sub
|
||||||
|
%half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
|
||||||
|
float 0.5, float 0.5, float 0.5, float 0.5,
|
||||||
|
float 0.5, float 0.5, float 0.5, float 0.5,
|
||||||
|
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||||
|
ret <16 x float> %half_scale
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; sqrt
|
||||||
|
|
||||||
|
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
||||||
|
|
||||||
|
; Square root of each lane of a 16-wide float vector.
; unary8to16 (defined outside this chunk) is given the result name
; `call`, the element type, the 8-wide AVX sqrt intrinsic and the unnamed
; argument %0; it must define %call, which is returned below.
define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||||
|
unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
|
||||||
|
ret <16 x float> %call
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; svml
|
||||||
|
|
||||||
|
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
|
||||||
|
; or, use the macro to call the 4-wide ones 4x with our 16-wide
|
||||||
|
; vectors...
|
||||||
|
|
||||||
|
declare <16 x float> @__svml_sin(<16 x float>)
|
||||||
|
declare <16 x float> @__svml_cos(<16 x float>)
|
||||||
|
declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
|
||||||
|
declare <16 x float> @__svml_tan(<16 x float>)
|
||||||
|
declare <16 x float> @__svml_atan(<16 x float>)
|
||||||
|
declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
|
||||||
|
declare <16 x float> @__svml_exp(<16 x float>)
|
||||||
|
declare <16 x float> @__svml_log(<16 x float>)
|
||||||
|
declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; float min/max
|
||||||
|
|
||||||
|
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||||
|
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||||
|
|
||||||
|
; Lane-wise maximum of two 16-wide float vectors.
; binary8to16 (defined outside this chunk) receives the result name
; `call`, the element type, the 8-wide AVX max intrinsic and both unnamed
; arguments; it must define %call, which is returned below.
define internal <16 x float> @__max_varying_float(<16 x float>,
|
||||||
|
<16 x float>) nounwind readonly alwaysinline {
|
||||||
|
binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
|
||||||
|
ret <16 x float> %call
|
||||||
|
}
|
||||||
|
|
||||||
|
; Lane-wise minimum of two 16-wide float vectors.
; binary8to16 (defined outside this chunk) receives the result name
; `call`, the element type, the 8-wide AVX min intrinsic and both unnamed
; arguments; it must define %call, which is returned below.
define internal <16 x float> @__min_varying_float(<16 x float>,
|
||||||
|
<16 x float>) nounwind readonly alwaysinline {
|
||||||
|
binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
|
||||||
|
ret <16 x float> %call
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; int min/max
|
||||||
|
|
||||||
|
; Lane-wise signed minimum of two 16-wide i32 vectors.
; binary4to16 (defined outside this chunk) is handed the 4-wide SSE4.1
; pminsd intrinsic and both unnamed arguments; it must define %ret.
define internal <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||||
|
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||||
|
ret <16 x i32> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; Lane-wise signed maximum of two 16-wide i32 vectors.
; binary4to16 (defined outside this chunk) is handed the 4-wide SSE4.1
; pmaxsd intrinsic and both unnamed arguments; it must define %ret.
define internal <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||||
|
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||||
|
ret <16 x i32> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; unsigned int min/max
|
||||||
|
|
||||||
|
; Lane-wise unsigned minimum of two 16-wide i32 vectors.
; binary4to16 (defined outside this chunk) is handed the 4-wide SSE4.1
; pminud intrinsic and both unnamed arguments; it must define %ret.
define internal <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||||
|
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||||
|
ret <16 x i32> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; Lane-wise unsigned maximum of two 16-wide i32 vectors.
; binary4to16 (defined outside this chunk) is handed the 4-wide SSE4.1
; pmaxud intrinsic and both unnamed arguments; it must define %ret.
define internal <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||||
|
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||||
|
ret <16 x i32> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
; horizontal ops
|
||||||
|
|
||||||
|
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||||
|
|
||||||
|
; Packs a 16-lane i32 mask into a single i32 bitmask.
; The vector is reinterpreted as floats and split into two 8-lane halves,
; each of which goes through the 8-wide AVX movmsk intrinsic. The result
; for lanes 8-15 is shifted left by 8 and OR-ed onto the result for
; lanes 0-7.
define internal i32 @__movmsk(<16 x i32> %lanes) nounwind readnone alwaysinline {
  %lanes_f = bitcast <16 x i32> %lanes to <16 x float>

  ; lanes 0-7 and lanes 8-15
  %half_lo = shufflevector <16 x float> %lanes_f, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %half_hi = shufflevector <16 x float> %lanes_f, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  %bits_lo = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %half_lo) nounwind readnone
  %bits_hi = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %half_hi) nounwind readnone

  ; high half occupies bits 8..15 of the result
  %bits_hi_shifted = shl i32 %bits_hi, 8
  %bits = or i32 %bits_hi_shifted, %bits_lo
  ret i32 %bits
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; horizontal float ops
|
||||||
|
|
||||||
|
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||||
|
|
||||||
|
; Sums all 16 lanes of a float vector into one scalar.
; The two 8-lane halves are combined with three rounds of the AVX
; horizontal-add intrinsic. Elements 0 and 4 of the last round are then
; added together to give the total.
define internal float @__reduce_add_float(<16 x float> %x) nounwind readonly alwaysinline {
  %x_lo = shufflevector <16 x float> %x, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %x_hi = shufflevector <16 x float> %x, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  ; three horizontal-add passes
  %pass1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %x_lo, <8 x float> %x_hi)
  %pass2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %pass1, <8 x float> %pass1)
  %pass3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %pass2, <8 x float> %pass2)

  ; combine the two partial sums held in elements 0 and 4
  %part_lo = extractelement <8 x float> %pass3, i32 0
  %part_hi = extractelement <8 x float> %pass3, i32 4
  %total = fadd float %part_lo, %part_hi
  ret float %total
}
|
||||||
|
|
||||||
|
|
||||||
|
; Minimum over all 16 lanes of a float vector.
; reduce16 (defined outside this chunk) is given the element type plus
; the varying and uniform min helpers. No vector operand is passed and no
; ret is written here, so the macro presumably reads the unnamed argument
; %0 and emits the ret itself -- keep the parameter unnamed.
define internal float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(float, @__min_varying_float, @__min_uniform_float)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
; Maximum over all 16 lanes of a float vector.
; reduce16 (defined outside this chunk) is given the element type plus
; the varying and uniform max helpers. No vector operand is passed and no
; ret is written here, so the macro presumably reads the unnamed argument
; %0 and emits the ret itself -- keep the parameter unnamed.
define internal float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(float, @__max_varying_float, @__max_uniform_float)
|
||||||
|
}
|
||||||
|
|
||||||
|
reduce_equal(16)
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; horizontal int32 ops
|
||||||
|
|
||||||
|
; Lane-wise sum of two 16-wide i32 vectors; passed to reduce16 by
; __reduce_add_int32 as the vector combiner.
define internal <16 x i32> @__add_varying_int32(<16 x i32> %lhs,
                                                <16 x i32> %rhs) nounwind readnone alwaysinline {
  %sum = add <16 x i32> %lhs, %rhs
  ret <16 x i32> %sum
}
|
||||||
|
|
||||||
|
; Scalar i32 sum; passed to reduce16 by __reduce_add_int32 as the scalar
; combiner.
define internal i32 @__add_uniform_int32(i32 %lhs, i32 %rhs) nounwind readnone alwaysinline {
  %sum = add i32 %lhs, %rhs
  ret i32 %sum
}
|
||||||
|
|
||||||
|
; Sum of all 16 lanes of an i32 vector.
; reduce16 (defined outside this chunk) is given the element type plus
; the vector and scalar add helpers. No vector operand is passed and no
; ret is written here, so the macro presumably reads the unnamed argument
; %0 and emits the ret itself -- keep the parameter unnamed.
define internal i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
; Signed minimum over all 16 lanes of an i32 vector.
; Built from reduce16 (defined outside this chunk), which presumably
; reads the unnamed argument %0 and emits the ret -- keep the parameter
; unnamed. @__min_uniform_int32 is not defined in this part of the file.
define internal i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
; Signed maximum over all 16 lanes of an i32 vector.
; Built from reduce16 (defined outside this chunk), which presumably
; reads the unnamed argument %0 and emits the ret -- keep the parameter
; unnamed. @__max_uniform_int32 is not defined in this part of the file.
define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; horizontal uint32 ops
|
||||||
|
|
||||||
|
; Sum of all 16 lanes of an unsigned 32-bit vector.
; Simply forwards to the signed reduction: the i32 `add` instruction
; makes no signed/unsigned distinction.
define internal i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
  %total = call i32 @__reduce_add_int32(<16 x i32> %v)
  ret i32 %total
}
|
||||||
|
|
||||||
|
; Unsigned minimum over all 16 lanes of an i32 vector.
; Built from reduce16 (defined outside this chunk), which presumably
; reads the unnamed argument %0 and emits the ret -- keep the parameter
; unnamed. @__min_uniform_uint32 is not defined in this part of the file.
define internal i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
; Unsigned maximum over all 16 lanes of an i32 vector.
; Built from reduce16 (defined outside this chunk), which presumably
; reads the unnamed argument %0 and emits the ret -- keep the parameter
; unnamed. @__max_uniform_uint32 is not defined in this part of the file.
define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; horizontal double ops
|
||||||
|
|
||||||
|
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||||
|
|
||||||
|
; Sums all 16 lanes of a double vector into one scalar.
; The vector is cut into four 4-lane quarters, which are added pairwise
; with vector fadd. Two rounds of the AVX horizontal-add intrinsic
; follow, and elements 0 and 2 of the last round are added to give the
; total.
define internal double @__reduce_add_double(<16 x double> %x) nounwind readonly alwaysinline {
  %q0 = shufflevector <16 x double> %x, <16 x double> undef,
        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %q1 = shufflevector <16 x double> %x, <16 x double> undef,
        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %q2 = shufflevector <16 x double> %x, <16 x double> undef,
        <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %q3 = shufflevector <16 x double> %x, <16 x double> undef,
        <4 x i32> <i32 12, i32 13, i32 14, i32 15>

  ; 16 -> 8 lanes with plain vector adds
  %q01 = fadd <4 x double> %q0, %q1
  %q23 = fadd <4 x double> %q2, %q3

  ; two horizontal-add passes
  %pass1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %q01, <4 x double> %q23)
  %pass2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %pass1, <4 x double> %pass1)

  ; combine the two partial sums held in elements 0 and 2
  %part_lo = extractelement <4 x double> %pass2, i32 0
  %part_hi = extractelement <4 x double> %pass2, i32 2
  %total = fadd double %part_lo, %part_hi
  ret double %total
}
|
||||||
|
|
||||||
|
; Minimum over all 16 lanes of a double vector.
; Built from reduce16 (defined outside this chunk), which presumably
; reads the unnamed argument %0 and emits the ret -- keep the parameter
; unnamed. @__min_uniform_double is not defined in this part of the file.
define internal double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(double, @__min_varying_double, @__min_uniform_double)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
; Maximum over all 16 lanes of a double vector.
; Built from reduce16 (defined outside this chunk), which presumably
; reads the unnamed argument %0 and emits the ret -- keep the parameter
; unnamed. @__max_uniform_double is not defined in this part of the file.
define internal double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(double, @__max_varying_double, @__max_uniform_double)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; horizontal int64 ops
|
||||||
|
|
||||||
|
; Lane-wise sum of two 16-wide i64 vectors; passed to reduce16 by
; __reduce_add_int64 as the vector combiner.
define internal <16 x i64> @__add_varying_int64(<16 x i64> %lhs,
                                                <16 x i64> %rhs) nounwind readnone alwaysinline {
  %sum = add <16 x i64> %lhs, %rhs
  ret <16 x i64> %sum
}
|
||||||
|
|
||||||
|
; Scalar i64 sum; passed to reduce16 by __reduce_add_int64 as the scalar
; combiner.
define internal i64 @__add_uniform_int64(i64 %lhs, i64 %rhs) nounwind readnone alwaysinline {
  %sum = add i64 %lhs, %rhs
  ret i64 %sum
}
|
||||||
|
|
||||||
|
; Sum of all 16 lanes of an i64 vector.
; reduce16 (defined outside this chunk) is given the element type plus
; the vector and scalar add helpers. No vector operand is passed and no
; ret is written here, so the macro presumably reads the unnamed argument
; %0 and emits the ret itself -- keep the parameter unnamed.
define internal i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
; Signed minimum over all 16 lanes of an i64 vector.
; Built from reduce16 (defined outside this chunk), which presumably
; reads the unnamed argument %0 and emits the ret -- keep the parameter
; unnamed. Both helper functions are defined outside this part of the
; file.
define internal i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
; Signed maximum over all 16 lanes of an i64 vector.
; Built from reduce16 (defined outside this chunk), which presumably
; reads the unnamed argument %0 and emits the ret -- keep the parameter
; unnamed. Both helper functions are defined outside this part of the
; file.
define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; horizontal uint64 ops
|
||||||
|
|
||||||
|
; Sum of all 16 lanes of an unsigned 64-bit vector.
; Simply forwards to the signed reduction: the i64 `add` instruction
; makes no signed/unsigned distinction.
define internal i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
  %total = call i64 @__reduce_add_int64(<16 x i64> %v)
  ret i64 %total
}
|
||||||
|
|
||||||
|
; Unsigned minimum over all 16 lanes of an i64 vector.
; Built from reduce16 (defined outside this chunk), which presumably
; reads the unnamed argument %0 and emits the ret -- keep the parameter
; unnamed. Both helper functions are defined outside this part of the
; file.
define internal i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
; Unsigned maximum over all 16 lanes of an i64 vector.
; Built from reduce16 (defined outside this chunk), which presumably
; reads the unnamed argument %0 and emits the ret -- keep the parameter
; unnamed. Both helper functions are defined outside this part of the
; file.
define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||||
|
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; unaligned loads/loads+broadcasts
|
||||||
|
|
||||||
|
load_and_broadcast(16, i8, 8)
|
||||||
|
load_and_broadcast(16, i16, 16)
|
||||||
|
load_and_broadcast(16, i32, 32)
|
||||||
|
load_and_broadcast(16, i64, 64)
|
||||||
|
|
||||||
|
; no masked load instruction for i8 and i16 types??
|
||||||
|
load_masked(16, i8, 8, 1)
|
||||||
|
load_masked(16, i16, 16, 2)
|
||||||
|
|
||||||
|
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
|
||||||
|
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
|
||||||
|
|
||||||
|
; Masked load of sixteen 32-bit values starting at %base.
; The mask is reinterpreted as floats and split into two 8-lane halves.
; Each half drives one 8-wide AVX maskload: the first at %base, the
; second 32 bytes further on (8 lanes x 4 bytes). The two results are
; joined and returned as <16 x i32>.
define <16 x i32> @__load_masked_32(i8 * %base, <16 x i32> %mask) nounwind alwaysinline {
  %mask_f = bitcast <16 x i32> %mask to <16 x float>
  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  ;; second half starts 8x4 bytes = 32 past the base
  %base_hi = getelementptr i8 * %base, i32 32

  %data_lo = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %base, <8 x float> %mask_lo)
  %data_hi = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %base_hi, <8 x float> %mask_hi)

  %data = shufflevector <8 x float> %data_lo, <8 x float> %data_hi,
          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %data_i = bitcast <16 x float> %data to <16 x i32>
  ret <16 x i32> %data_i
}
|
||||||
|
|
||||||
|
|
||||||
|
; Masked load of sixteen 64-bit values starting at %base.
; The mask has one i32 per lane, so each mask element is duplicated to
; cover a 64-bit lane and the result is bitcast to <4 x double>. Four
; 4-wide AVX maskloads are issued at byte offsets 0, 32, 64 and 96
; (4 lanes x 8 bytes each), and their results are joined into
; <16 x i64>.
define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
  ; quarter 0: mask lanes 0-3, doubled up
  %dup0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %dmask0 = bitcast <8 x i32> %dup0 to <4 x double>
  ; quarter 1: mask lanes 4-7
  %dup1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  %dmask1 = bitcast <8 x i32> %dup1 to <4 x double>
  ; quarter 2: mask lanes 8-11
  %dup2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
  %dmask2 = bitcast <8 x i32> %dup2 to <4 x double>
  ; quarter 3: mask lanes 12-15
  %dup3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %dmask3 = bitcast <8 x i32> %dup3 to <4 x double>

  ; byte addresses of quarters 1..3
  %addr1 = getelementptr i8 * %0, i32 32
  %addr2 = getelementptr i8 * %0, i32 64
  %addr3 = getelementptr i8 * %0, i32 96

  %part0 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %dmask0)
  %part1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr1, <4 x double> %dmask1)
  %part2 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr2, <4 x double> %dmask2)
  %part3 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr3, <4 x double> %dmask3)

  ; 4+4 -> 8, 8+8 -> 16
  %part01 = shufflevector <4 x double> %part0, <4 x double> %part1,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %part23 = shufflevector <4 x double> %part2, <4 x double> %part3,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %all = shufflevector <8 x double> %part01, <8 x double> %part23,
          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %all_i = bitcast <16 x double> %all to <16 x i64>
  ret <16 x i64> %all_i
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; masked store
|
||||||
|
|
||||||
|
; FIXME: there is no AVX instruction for these, but we could be clever
|
||||||
|
; by packing the bits down and setting the last 3/4 or half, respectively,
|
||||||
|
; of the mask to zero... Not sure if this would be a win in the end
|
||||||
|
gen_masked_store(16, i8, 8)
|
||||||
|
gen_masked_store(16, i16, 16)
|
||||||
|
|
||||||
|
; note that mask is the 2nd parameter, not the 3rd one!!
|
||||||
|
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
|
||||||
|
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
|
||||||
|
|
||||||
|
; Masked store of sixteen 32-bit values.
;   %dst     destination, addressed as raw bytes below
;   %src     values to store
;   %active  per-lane mask
; Values and mask are reinterpreted as floats and split into two 8-lane
; halves, then written with two 8-wide AVX maskstores. The second one
; targets the address 32 bytes past the first. Note that the intrinsic
; takes the mask as its 2nd operand and the data as its 3rd.
define void @__masked_store_32(<16 x i32>* nocapture %dst, <16 x i32> %src,
                               <16 x i32> %active) nounwind alwaysinline {
  %base = bitcast <16 x i32> * %dst to i8 *
  %base_hi = getelementptr i8 * %base, i32 32
  %src_f = bitcast <16 x i32> %src to <16 x float>
  %active_f = bitcast <16 x i32> %active to <16 x float>

  ; lanes 0-7
  %active_lo = shufflevector <16 x float> %active_f, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %src_lo = shufflevector <16 x float> %src_f, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; lanes 8-15
  %active_hi = shufflevector <16 x float> %active_f, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %src_hi = shufflevector <16 x float> %src_f, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  call void @llvm.x86.avx.maskstore.ps.256(i8 * %base, <8 x float> %active_lo, <8 x float> %src_lo)
  call void @llvm.x86.avx.maskstore.ps.256(i8 * %base_hi, <8 x float> %active_hi, <8 x float> %src_hi)

  ret void
}
|
||||||
|
|
||||||
|
; Masked store of sixteen 64-bit values.
;   %dst   destination, addressed as raw bytes below
;   %src   values to store
;   %mask  per-lane mask, one i32 per lane
; Each mask element is duplicated to cover a 64-bit lane and bitcast to
; <4 x double>. The data is split into four 4-lane quarters and written
; with four 4-wide AVX maskstores at byte offsets 0, 32, 64 and 96. The
; intrinsic takes the mask as its 2nd operand and the data as its 3rd.
define void @__masked_store_64(<16 x i64>* nocapture %dst, <16 x i64> %src,
                               <16 x i32> %mask) nounwind alwaysinline {
  %base = bitcast <16 x i64> * %dst to i8 *
  %src_d = bitcast <16 x i64> %src to <16 x double>

  ; byte addresses of quarters 1..3
  %addr1 = getelementptr i8 * %base, i32 32
  %addr2 = getelementptr i8 * %base, i32 64
  %addr3 = getelementptr i8 * %base, i32 96

  ; quarter 0: lanes 0-3
  %dup0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %dmask0 = bitcast <8 x i32> %dup0 to <4 x double>
  %data0 = shufflevector <16 x double> %src_d, <16 x double> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; quarter 1: lanes 4-7
  %dup1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  %dmask1 = bitcast <8 x i32> %dup1 to <4 x double>
  %data1 = shufflevector <16 x double> %src_d, <16 x double> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; quarter 2: lanes 8-11
  %dup2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
  %dmask2 = bitcast <8 x i32> %dup2 to <4 x double>
  %data2 = shufflevector <16 x double> %src_d, <16 x double> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  ; quarter 3: lanes 12-15
  %dup3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %dmask3 = bitcast <8 x i32> %dup3 to <4 x double>
  %data3 = shufflevector <16 x double> %src_d, <16 x double> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>

  call void @llvm.x86.avx.maskstore.pd.256(i8 * %base, <4 x double> %dmask0, <4 x double> %data0)
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %addr1, <4 x double> %dmask1, <4 x double> %data1)
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %addr2, <4 x double> %dmask2, <4 x double> %data2)
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %addr3, <4 x double> %dmask3, <4 x double> %data3)

  ret void
}
|
||||||
|
|
||||||
|
|
||||||
|
masked_store_blend_8_16_by_16()
|
||||||
|
|
||||||
|
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
|
||||||
|
<8 x float>) nounwind readnone
|
||||||
|
|
||||||
|
; Blend-based masked store of sixteen 32-bit values.
;   %dst     destination vector in memory
;   %incoming  new values
;   %active  per-lane mask
; The current contents of %dst are loaded (align 4). Old values, new
; values and mask are each reinterpreted as floats and split into two
; 8-lane halves. Each half goes through the 8-wide AVX blendv intrinsic
; with operands (old, new, mask), and the joined result is stored back
; to %dst (align 4).
define void @__masked_store_blend_32(<16 x i32>* nocapture %dst, <16 x i32> %incoming,
                                     <16 x i32> %active) nounwind alwaysinline {
  %current = load <16 x i32>* %dst, align 4
  %current_f = bitcast <16 x i32> %current to <16 x float>
  %incoming_f = bitcast <16 x i32> %incoming to <16 x float>
  %active_f = bitcast <16 x i32> %active to <16 x float>

  ; lanes 0-7
  %current_lo = shufflevector <16 x float> %current_f, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %incoming_lo = shufflevector <16 x float> %incoming_f, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %active_lo = shufflevector <16 x float> %active_f, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %merged_lo = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %current_lo, <8 x float> %incoming_lo, <8 x float> %active_lo)

  ; lanes 8-15
  %current_hi = shufflevector <16 x float> %current_f, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %incoming_hi = shufflevector <16 x float> %incoming_f, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %active_hi = shufflevector <16 x float> %active_f, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %merged_hi = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %current_hi, <8 x float> %incoming_hi, <8 x float> %active_hi)

  %merged = shufflevector <8 x float> %merged_lo, <8 x float> %merged_hi,
          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %merged_i = bitcast <16 x float> %merged to <16 x i32>
  store <16 x i32> %merged_i, <16 x i32>* %dst, align 4
  ret void
}
|
||||||
|
|
||||||
|
|
||||||
|
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
|
||||||
|
<4 x double>) nounwind readnone
|
||||||
|
|
||||||
|
; Blend-based masked store of sixteen 64-bit values.
;   %ptr     destination vector in memory
;   %newi64  new values
;   %mask    per-lane mask, one i32 per lane
; The current contents of %ptr are loaded (align 8). Old and new values
; are reinterpreted as doubles and split into four 4-lane quarters. Each
; mask element is duplicated to cover a 64-bit lane and bitcast to
; <4 x double>. Each quarter goes through the 4-wide AVX blendv intrinsic
; with operands (old, new, mask), and the joined result is stored back
; to %ptr.
;
; The final store carries an explicit `align 8`, matching the load from
; the same pointer. Without it the store would assume the ABI alignment
; of <16 x i64>, which the pointer is not known to satisfy.
define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
                                     <16 x i32> %mask) nounwind alwaysinline {
  ; old values, split into quarters
  %oldValue = load <16 x i64>* %ptr, align 8
  %old = bitcast <16 x i64> %oldValue to <16 x double>
  %old0d = shufflevector <16 x double> %old, <16 x double> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %old1d = shufflevector <16 x double> %old, <16 x double> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %old2d = shufflevector <16 x double> %old, <16 x double> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %old3d = shufflevector <16 x double> %old, <16 x double> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>

  ; new values, split into quarters
  %new = bitcast <16 x i64> %newi64 to <16 x double>
  %new0d = shufflevector <16 x double> %new, <16 x double> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %new1d = shufflevector <16 x double> %new, <16 x double> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %new2d = shufflevector <16 x double> %new, <16 x double> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %new3d = shufflevector <16 x double> %new, <16 x double> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>

  ; double up masks, bitcast to doubles
  %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  %mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
  %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
          <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>

  ; blend each quarter: operands are (old, new, mask)
  %result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
                                 <4 x double> %new0d, <4 x double> %mask0d)
  %result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
                                 <4 x double> %new1d, <4 x double> %mask1d)
  %result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
                                 <4 x double> %new2d, <4 x double> %mask2d)
  %result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
                                 <4 x double> %new3d, <4 x double> %mask3d)

  ; 4+4 -> 8, 8+8 -> 16
  %result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

  %result = shufflevector <8 x double> %result01, <8 x double> %result23,
          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %result64 = bitcast <16 x double> %result to <16 x i64>
  ; same alignment as the load above; do not rely on the vector type's ABI alignment
  store <16 x i64> %result64, <16 x i64> * %ptr, align 8
  ret void
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; gather/scatter
|
||||||
|
|
||||||
|
gen_gather(16, i8)
|
||||||
|
gen_gather(16, i16)
|
||||||
|
gen_gather(16, i32)
|
||||||
|
gen_gather(16, i64)
|
||||||
|
|
||||||
|
gen_scatter(16, i8)
|
||||||
|
gen_scatter(16, i16)
|
||||||
|
gen_scatter(16, i32)
|
||||||
|
gen_scatter(16, i64)
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; double precision sqrt
|
||||||
|
|
||||||
|
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
||||||
|
|
||||||
|
; Square root of each lane of a 16-wide double vector.
; unary4to16 (defined outside this chunk) is given the result name `ret`,
; the element type, the 4-wide AVX sqrt intrinsic and the unnamed
; argument %0; it must define %ret, which is returned below.
define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
|
||||||
|
unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
||||||
|
ret <16 x double> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; double precision min/max
|
||||||
|
|
||||||
|
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||||
|
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||||
|
|
||||||
|
; Lane-wise minimum of two 16-wide double vectors.
; binary4to16 (defined outside this chunk) is handed the 4-wide AVX min
; intrinsic and both unnamed arguments; it must define %ret.
define internal <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||||
|
binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
||||||
|
ret <16 x double> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; Lane-wise maximum of two 16-wide double vectors.
; binary4to16 (defined outside this chunk) is handed the 4-wide AVX max
; intrinsic and both unnamed arguments; it must define %ret.
define internal <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||||
|
binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
||||||
|
ret <16 x double> %ret
|
||||||
|
}
|
||||||
219
builtins-avx.ll
219
builtins-avx.ll
@@ -44,11 +44,12 @@ packed_load_and_store(8)
|
|||||||
scans(8)
|
scans(8)
|
||||||
int64minmax(8)
|
int64minmax(8)
|
||||||
|
|
||||||
|
include(`builtins-avx-common.ll')
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rcp
|
;; rcp
|
||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
||||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; float iv = __rcp_v(v);
|
; float iv = __rcp_v(v);
|
||||||
@@ -63,25 +64,10 @@ define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly
|
|||||||
ret <8 x float> %iv_mul
|
ret <8 x float> %iv_mul
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
|
||||||
; uniform float iv = extract(__rcp_u(v), 0);
|
|
||||||
; return iv * (2. - v * iv);
|
|
||||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
|
||||||
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
|
||||||
%scall = extractelement <4 x float> %call, i32 0
|
|
||||||
|
|
||||||
; do one N-R iteration
|
|
||||||
%v_iv = fmul float %0, %scall
|
|
||||||
%two_minus = fsub float 2., %v_iv
|
|
||||||
%iv_mul = fmul float %scall, %two_minus
|
|
||||||
ret float %iv_mul
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rounding floats
|
;; rounding floats
|
||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
||||||
declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
|
||||||
|
|
||||||
define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||||
@@ -89,111 +75,43 @@ define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonl
|
|||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
|
||||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
|
||||||
; the roundss intrinsic is a total mess--docs say:
|
|
||||||
;
|
|
||||||
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
|
||||||
;
|
|
||||||
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
|
||||||
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
|
||||||
; return value is described by the following equations:
|
|
||||||
;
|
|
||||||
; r0 = RND(b0)
|
|
||||||
; r1 = a1
|
|
||||||
; r2 = a2
|
|
||||||
; r3 = a3
|
|
||||||
;
|
|
||||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
|
||||||
; here. So we pass the same register for both.
|
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
|
||||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
|
||||||
%rs = extractelement <4 x float> %xr, i32 0
|
|
||||||
ret float %rs
|
|
||||||
}
|
|
||||||
|
|
||||||
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
|
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
|
||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
|
||||||
; see above for round_ss instrinsic discussion...
|
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
|
||||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
|
||||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
|
||||||
%rs = extractelement <4 x float> %xr, i32 0
|
|
||||||
ret float %rs
|
|
||||||
}
|
|
||||||
|
|
||||||
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
|
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
|
||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
|
||||||
; see above for round_ss instrinsic discussion...
|
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
|
||||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
|
||||||
%rs = extractelement <4 x float> %xr, i32 0
|
|
||||||
ret float %rs
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rounding doubles
|
;; rounding doubles
|
||||||
|
|
||||||
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
||||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
|
||||||
|
|
||||||
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||||
round4to8double(%0, 8)
|
round4to8double(%0, 8)
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
|
||||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
|
||||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
|
||||||
%rs = extractelement <2 x double> %xr, i32 0
|
|
||||||
ret double %rs
|
|
||||||
}
|
|
||||||
|
|
||||||
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||||
round4to8double(%0, 9)
|
round4to8double(%0, 9)
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
|
||||||
; see above for round_ss instrinsic discussion...
|
|
||||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
|
||||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
|
||||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
|
||||||
%rs = extractelement <2 x double> %xr, i32 0
|
|
||||||
ret double %rs
|
|
||||||
}
|
|
||||||
|
|
||||||
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||||
round4to8double(%0, 10)
|
round4to8double(%0, 10)
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
|
||||||
; see above for round_ss instrinsic discussion...
|
|
||||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
|
||||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
|
||||||
%rs = extractelement <2 x double> %xr, i32 0
|
|
||||||
ret double %rs
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rsqrt
|
;; rsqrt
|
||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
||||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||||
; float is = __rsqrt_v(v);
|
; float is = __rsqrt_v(v);
|
||||||
@@ -201,64 +119,24 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read
|
|||||||
; return 0.5 * is * (3. - (v * is) * is);
|
; return 0.5 * is * (3. - (v * is) * is);
|
||||||
%v_is = fmul <8 x float> %v, %is
|
%v_is = fmul <8 x float> %v, %is
|
||||||
%v_is_is = fmul <8 x float> %v_is, %is
|
%v_is_is = fmul <8 x float> %v_is, %is
|
||||||
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3., float 3., float 3., float 3., float 3.>, %v_is_is
|
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
|
||||||
|
float 3., float 3., float 3., float 3.>, %v_is_is
|
||||||
%is_mul = fmul <8 x float> %is, %three_sub
|
%is_mul = fmul <8 x float> %is, %three_sub
|
||||||
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
|
||||||
|
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||||
ret <8 x float> %half_scale
|
ret <8 x float> %half_scale
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
|
||||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
|
||||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
|
||||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
|
||||||
%is = extractelement <4 x float> %vis, i32 0
|
|
||||||
|
|
||||||
; return 0.5 * is * (3. - (v * is) * is);
|
|
||||||
%v_is = fmul float %0, %is
|
|
||||||
%v_is_is = fmul float %v_is, %is
|
|
||||||
%three_sub = fsub float 3., %v_is_is
|
|
||||||
%is_mul = fmul float %is, %three_sub
|
|
||||||
%half_scale = fmul float 0.5, %is_mul
|
|
||||||
ret float %half_scale
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; sqrt
|
;; sqrt
|
||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
||||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
%call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
|
%call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
|
||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
|
||||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
|
||||||
ret float %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; fastmath
|
|
||||||
|
|
||||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
|
||||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
|
||||||
|
|
||||||
define internal void @__fastmath() nounwind alwaysinline {
|
|
||||||
%ptr = alloca i32
|
|
||||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
|
||||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
|
||||||
%oldval = load i32 *%ptr
|
|
||||||
|
|
||||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
|
||||||
%update = or i32 %oldval, 32832
|
|
||||||
store i32 %update, i32 *%ptr
|
|
||||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; svml
|
;; svml
|
||||||
|
|
||||||
@@ -280,9 +158,7 @@ declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
|
|||||||
;; float min/max
|
;; float min/max
|
||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define internal <8 x float> @__max_varying_float(<8 x float>,
|
define internal <8 x float> @__max_varying_float(<8 x float>,
|
||||||
<8 x float>) nounwind readonly alwaysinline {
|
<8 x float>) nounwind readonly alwaysinline {
|
||||||
@@ -290,94 +166,43 @@ define internal <8 x float> @__max_varying_float(<8 x float>,
|
|||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
|
||||||
ret float %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define internal <8 x float> @__min_varying_float(<8 x float>,
|
define internal <8 x float> @__min_varying_float(<8 x float>,
|
||||||
<8 x float>) nounwind readonly alwaysinline {
|
<8 x float>) nounwind readonly alwaysinline {
|
||||||
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
|
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
|
||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
|
||||||
ret float %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; int min/max
|
;; int min/max
|
||||||
|
|
||||||
; no 8-wide integer stuff in avx1...
|
|
||||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
|
||||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
|
||||||
|
|
||||||
define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||||
ret <8 x i32> %ret
|
ret <8 x i32> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
|
||||||
ret i32 %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||||
ret <8 x i32> %ret
|
ret <8 x i32> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
|
||||||
ret i32 %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; unsigned int min/max
|
;; unsigned int min/max
|
||||||
|
|
||||||
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
|
||||||
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
|
||||||
|
|
||||||
define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||||
ret <8 x i32> %ret
|
ret <8 x i32> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
|
||||||
ret i32 %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||||
ret <8 x i32> %ret
|
ret <8 x i32> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
|
||||||
ret i32 %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
; horizontal ops
|
; horizontal ops
|
||||||
|
|
||||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
|
||||||
|
|
||||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
|
||||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
|
||||||
ret i32 %call
|
|
||||||
}
|
|
||||||
|
|
||||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
|
||||||
|
|
||||||
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
|
||||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
|
||||||
ret i64 %call
|
|
||||||
}
|
|
||||||
|
|
||||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||||
|
|
||||||
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||||
@@ -471,9 +296,10 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
|
|||||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||||
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
|
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
|
||||||
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
|
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
|
||||||
%scalar1 = extractelement <4 x double> %sum0, i32 0
|
%final0 = extractelement <4 x double> %sum1, i32 0
|
||||||
%scalar2 = extractelement <4 x double> %sum1, i32 1
|
%final1 = extractelement <4 x double> %sum1, i32 2
|
||||||
%sum = fadd double %scalar1, %scalar2
|
%sum = fadd double %final0, %final1
|
||||||
|
|
||||||
ret double %sum
|
ret double %sum
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -623,12 +449,13 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
masked_store_blend_8_16_by_8()
|
masked_store_blend_8_16_by_8()
|
||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
|
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
|
||||||
<8 x float>) nounwind readnone
|
<8 x float>) nounwind readnone
|
||||||
|
|
||||||
|
|
||||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||||
<8 x i32>) nounwind alwaysinline {
|
<8 x i32>) nounwind alwaysinline {
|
||||||
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
|
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
|
||||||
@@ -694,6 +521,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; gather/scatter
|
;; gather/scatter
|
||||||
|
|
||||||
@@ -711,43 +539,26 @@ gen_scatter(8, i64)
|
|||||||
;; double precision sqrt
|
;; double precision sqrt
|
||||||
|
|
||||||
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
||||||
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
|
|
||||||
|
|
||||||
define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||||
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
||||||
ret <8 x double> %ret
|
ret <8 x double> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
|
||||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
|
|
||||||
ret double %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; double precision min/max
|
;; double precision min/max
|
||||||
|
|
||||||
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||||
declare <2 x double> @llvm.x86.sse.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
||||||
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||||
declare <2 x double> @llvm.x86.sse.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
||||||
|
|
||||||
define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||||
binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
||||||
ret <8 x double> %ret
|
ret <8 x double> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse.min.sd, %0, %1)
|
|
||||||
ret double %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||||
binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
||||||
ret <8 x double> %ret
|
ret <8 x double> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse.max.sd, %0, %1)
|
|
||||||
ret double %ret
|
|
||||||
}
|
|
||||||
|
|||||||
29
builtins-c.c
29
builtins-c.c
@@ -51,6 +51,10 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef _MSC_VER
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif // !_MSC_VER
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
@@ -139,3 +143,28 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
|||||||
}
|
}
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int __num_cores() {
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
// This is quite a hack. Including all of windows.h to get this definition
|
||||||
|
// pulls in a bunch of stuff that leads to undefined symbols at link time.
|
||||||
|
// So we don't #include <windows.h> but instead have the equivalent declarations
|
||||||
|
// here. Presumably this struct declaration won't be changing in the future
|
||||||
|
// anyway...
|
||||||
|
struct SYSTEM_INFO {
|
||||||
|
int pad0[2];
|
||||||
|
void *pad1[2];
|
||||||
|
int *pad2;
|
||||||
|
int dwNumberOfProcessors;
|
||||||
|
int pad3[3];
|
||||||
|
};
|
||||||
|
|
||||||
|
struct SYSTEM_INFO sysInfo;
|
||||||
|
extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
|
||||||
|
GetSystemInfo(&sysInfo);
|
||||||
|
return sysInfo.dwNumberOfProcessors;
|
||||||
|
#else
|
||||||
|
return sysconf(_SC_NPROCESSORS_ONLN);
|
||||||
|
#endif // !_MSC_VER
|
||||||
|
}
|
||||||
|
|||||||
@@ -277,41 +277,17 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
; horizontal ops / reductions
|
; horizontal ops / reductions
|
||||||
|
|
||||||
; FIXME: this is very inefficient, loops over all 32 bits...
|
declare i32 @llvm.ctpop.i32(i32)
|
||||||
|
declare i64 @llvm.ctpop.i64(i64)
|
||||||
; we could use the LLVM intrinsic declare i32 @llvm.ctpop.i32(i32),
|
|
||||||
; although that currently ends up generating a POPCNT instruction even
|
|
||||||
; if we give --target=sse2 on the command line. We probably need to
|
|
||||||
; pipe through the 'sse2' request to LLVM via the 'features' string
|
|
||||||
; at codegen time... (If e.g. --cpu=penryn is also passed along, then
|
|
||||||
; it does generate non-POPCNT code and in particular better code than
|
|
||||||
; the below does.)
|
|
||||||
|
|
||||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||||
entry:
|
%val = call i32 @llvm.ctpop.i32(i32 %0)
|
||||||
br label %loop
|
ret i32 %val
|
||||||
|
|
||||||
loop:
|
|
||||||
%count = phi i32 [ 0, %entry ], [ %newcount, %loop ]
|
|
||||||
%val = phi i32 [ %0, %entry ], [ %newval, %loop ]
|
|
||||||
%delta = and i32 %val, 1
|
|
||||||
%newcount = add i32 %count, %delta
|
|
||||||
%newval = lshr i32 %val, 1
|
|
||||||
%done = icmp eq i32 %newval, 0
|
|
||||||
br i1 %done, label %exit, label %loop
|
|
||||||
|
|
||||||
exit:
|
|
||||||
ret i32 %newcount
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal i32 @__popcnt_int64(i64) nounwind readnone alwaysinline {
|
define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
|
||||||
%vec = bitcast i64 %0 to <2 x i32>
|
%val = call i64 @llvm.ctpop.i64(i64 %0)
|
||||||
%v0 = extractelement <2 x i32> %vec, i32 0
|
ret i64 %val
|
||||||
%v1 = extractelement <2 x i32> %vec, i32 1
|
|
||||||
%c0 = call i32 @__popcnt_int32(i32 %v0)
|
|
||||||
%c1 = call i32 @__popcnt_int32(i32 %v1)
|
|
||||||
%sum = add i32 %c0, %c1
|
|
||||||
ret i32 %sum
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
|
|||||||
}
|
}
|
||||||
|
|
||||||
define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
|
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
|
||||||
ret <4 x float> %call
|
ret <4 x float> %call
|
||||||
}
|
}
|
||||||
@@ -85,14 +85,14 @@ define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonl
|
|||||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
; see above for round_ss instrinsic discussion...
|
; see above for round_ss instrinsic discussion...
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
; roundps, round down 0b01 | don't signal precision exceptions 0b1010 = 9
|
||||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||||
%rs = extractelement <4 x float> %xr, i32 0
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
ret float %rs
|
ret float %rs
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
|
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
|
||||||
ret <4 x float> %call
|
ret <4 x float> %call
|
||||||
}
|
}
|
||||||
@@ -100,7 +100,7 @@ define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly
|
|||||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
; see above for round_ss instrinsic discussion...
|
; see above for round_ss instrinsic discussion...
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||||
%rs = extractelement <4 x float> %xr, i32 0
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
ret float %rs
|
ret float %rs
|
||||||
@@ -124,28 +124,28 @@ define internal double @__round_uniform_double(double) nounwind readonly alwaysi
|
|||||||
}
|
}
|
||||||
|
|
||||||
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
round2to4double(%0, 9)
|
round2to4double(%0, 9)
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
; see above for round_ss instrinsic discussion...
|
; see above for round_ss instrinsic discussion...
|
||||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||||
%rs = extractelement <2 x double> %xr, i32 0
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
ret double %rs
|
ret double %rs
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
round2to4double(%0, 10)
|
round2to4double(%0, 10)
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
; see above for round_ss instrinsic discussion...
|
; see above for round_ss instrinsic discussion...
|
||||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||||
%rs = extractelement <2 x double> %xr, i32 0
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
ret double %rs
|
ret double %rs
|
||||||
|
|||||||
@@ -498,28 +498,28 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
|
|||||||
}
|
}
|
||||||
|
|
||||||
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
round4to8(%0, 9)
|
round4to8(%0, 9)
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
; see above for round_ss intrinsic discussion...
|
; see above for round_ss intrinsic discussion...
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||||
%rs = extractelement <4 x float> %xr, i32 0
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
ret float %rs
|
ret float %rs
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
round4to8(%0, 10)
|
round4to8(%0, 10)
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
; see above for round_ss intrinsic discussion...
|
; see above for round_ss intrinsic discussion...
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||||
%rs = extractelement <4 x float> %xr, i32 0
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
ret float %rs
|
ret float %rs
|
||||||
@@ -543,28 +543,28 @@ define internal double @__round_uniform_double(double) nounwind readonly alwaysi
|
|||||||
}
|
}
|
||||||
|
|
||||||
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
round2to8double(%0, 9)
|
round2to8double(%0, 9)
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
; see above for round_ss intrinsic discussion...
|
; see above for round_ss intrinsic discussion...
|
||||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||||
%rs = extractelement <2 x double> %xr, i32 0
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
ret double %rs
|
ret double %rs
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
round2to8double(%0, 10)
|
round2to8double(%0, 10)
|
||||||
}
|
}
|
||||||
|
|
||||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
; see above for round_ss intrinsic discussion...
|
; see above for round_ss intrinsic discussion...
|
||||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||||
%rs = extractelement <2 x double> %xr, i32 0
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
ret double %rs
|
ret double %rs
|
||||||
|
|||||||
37
builtins.cpp
37
builtins.cpp
@@ -55,7 +55,7 @@
|
|||||||
#include <llvm/Intrinsics.h>
|
#include <llvm/Intrinsics.h>
|
||||||
#include <llvm/Linker.h>
|
#include <llvm/Linker.h>
|
||||||
#include <llvm/Target/TargetMachine.h>
|
#include <llvm/Target/TargetMachine.h>
|
||||||
#include <llvm/Target/SubtargetFeature.h>
|
#include <llvm/ADT/Triple.h>
|
||||||
#include <llvm/Support/MemoryBuffer.h>
|
#include <llvm/Support/MemoryBuffer.h>
|
||||||
#include <llvm/Bitcode/ReaderWriter.h>
|
#include <llvm/Bitcode/ReaderWriter.h>
|
||||||
|
|
||||||
@@ -389,6 +389,27 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
||||||
|
SymbolTable *symbolTable) {
|
||||||
|
std::vector<const Type *> args;
|
||||||
|
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
|
||||||
|
Symbol *sym = new Symbol(name, SourcePos(), ft);
|
||||||
|
sym->isStatic = true;
|
||||||
|
|
||||||
|
llvm::Function *func = module->getFunction(name);
|
||||||
|
assert(func != NULL); // it should be declared already...
|
||||||
|
func->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||||
|
llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
|
||||||
|
llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
|
||||||
|
|
||||||
|
sym->function = func;
|
||||||
|
symbolTable->AddVariable(sym);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||||
Symbol *pidx = new Symbol("programIndex", SourcePos(),
|
Symbol *pidx = new Symbol("programIndex", SourcePos(),
|
||||||
@@ -454,11 +475,23 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case Target::AVX:
|
case Target::AVX:
|
||||||
|
switch (g->target.vectorWidth) {
|
||||||
|
case 8:
|
||||||
extern unsigned char builtins_bitcode_avx[];
|
extern unsigned char builtins_bitcode_avx[];
|
||||||
extern int builtins_bitcode_avx_length;
|
extern int builtins_bitcode_avx_length;
|
||||||
lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module,
|
lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module,
|
||||||
symbolTable);
|
symbolTable);
|
||||||
break;
|
break;
|
||||||
|
case 16:
|
||||||
|
extern unsigned char builtins_bitcode_avx_x2[];
|
||||||
|
extern int builtins_bitcode_avx_x2_length;
|
||||||
|
lAddBitcode(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
|
||||||
|
module, symbolTable);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
FATAL("logic error in DefineStdlib");
|
||||||
|
}
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
FATAL("logic error");
|
FATAL("logic error");
|
||||||
}
|
}
|
||||||
@@ -480,6 +513,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
|||||||
symbolTable);
|
symbolTable);
|
||||||
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
|
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
|
||||||
symbolTable);
|
symbolTable);
|
||||||
|
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
|
||||||
|
symbolTable);
|
||||||
|
|
||||||
if (includeStdlibISPC) {
|
if (includeStdlibISPC) {
|
||||||
// If the user wants the standard library to be included, parse the
|
// If the user wants the standard library to be included, parse the
|
||||||
|
|||||||
482
builtins.m4
482
builtins.m4
@@ -111,6 +111,32 @@ define(`reduce8', `
|
|||||||
'
|
'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
define(`reduce16', `
|
||||||
|
%v1 = shufflevector <16 x $1> %0, <16 x $1> undef,
|
||||||
|
<16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
|
||||||
|
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||||
|
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||||
|
%m1 = call <16 x $1> $2(<16 x $1> %v1, <16 x $1> %0)
|
||||||
|
%v2 = shufflevector <16 x $1> %m1, <16 x $1> undef,
|
||||||
|
<16 x i32> <i32 4, i32 5, i32 6, i32 7,
|
||||||
|
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||||
|
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||||
|
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||||
|
%m2 = call <16 x $1> $2(<16 x $1> %v2, <16 x $1> %m1)
|
||||||
|
%v3 = shufflevector <16 x $1> %m2, <16 x $1> undef,
|
||||||
|
<16 x i32> <i32 2, i32 3, i32 undef, i32 undef,
|
||||||
|
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||||
|
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||||
|
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||||
|
%m3 = call <16 x $1> $2(<16 x $1> %v3, <16 x $1> %m2)
|
||||||
|
|
||||||
|
%m3a = extractelement <16 x $1> %m3, i32 0
|
||||||
|
%m3b = extractelement <16 x $1> %m3, i32 1
|
||||||
|
%m = call $1 $3($1 %m3a, $1 %m3b)
|
||||||
|
ret $1 %m
|
||||||
|
'
|
||||||
|
)
|
||||||
|
|
||||||
;; Do a reduction over an 8-wide vector, using a vector reduction function
|
;; Do a reduction over an 8-wide vector, using a vector reduction function
|
||||||
;; that only takes 4-wide vectors
|
;; that only takes 4-wide vectors
|
||||||
;; $1: type of final scalar result
|
;; $1: type of final scalar result
|
||||||
@@ -211,6 +237,45 @@ define(`unary4to8', `
|
|||||||
'
|
'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
define(`unary4to16', `
|
||||||
|
%$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||||
|
%v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0)
|
||||||
|
%$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1)
|
||||||
|
%$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||||
|
%v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2)
|
||||||
|
%$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||||
|
%v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3)
|
||||||
|
|
||||||
|
%$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1,
|
||||||
|
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3,
|
||||||
|
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b,
|
||||||
|
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||||
|
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||||
|
'
|
||||||
|
)
|
||||||
|
|
||||||
|
;; And so forth...
|
||||||
|
;; $1: name of variable into which the final result should go
|
||||||
|
;; $2: scalar type of the vector elements
|
||||||
|
;; $3: 8-wide unary vector function to apply
|
||||||
|
;; $4: 16-wide operand value
|
||||||
|
|
||||||
|
define(`unary8to16', `
|
||||||
|
%$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||||
|
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0)
|
||||||
|
%$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||||
|
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||||
|
%v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1)
|
||||||
|
%$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1,
|
||||||
|
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||||
|
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||||
|
'
|
||||||
|
)
|
||||||
|
|
||||||
;; And along the lines of `binary2to4', this maps a 4-wide binary function to
|
;; And along the lines of `binary2to4', this maps a 4-wide binary function to
|
||||||
;; two 8-wide vector operands
|
;; two 8-wide vector operands
|
||||||
;; $1: name of variable into which the final result should go
|
;; $1: name of variable into which the final result should go
|
||||||
@@ -231,6 +296,57 @@ define(`binary4to8', `
|
|||||||
'
|
'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
define(`binary8to16', `
|
||||||
|
%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||||
|
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef,
|
||||||
|
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0a, <8 x $2> %$1_0b)
|
||||||
|
%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||||
|
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||||
|
%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef,
|
||||||
|
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||||
|
%v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1a, <8 x $2> %$1_1b)
|
||||||
|
%$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1,
|
||||||
|
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||||
|
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||||
|
'
|
||||||
|
)
|
||||||
|
|
||||||
|
define(`binary4to16', `
|
||||||
|
%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||||
|
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||||
|
%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef,
|
||||||
|
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||||
|
%r$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b)
|
||||||
|
|
||||||
|
%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||||
|
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef,
|
||||||
|
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%r$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b)
|
||||||
|
|
||||||
|
%$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||||
|
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||||
|
%$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef,
|
||||||
|
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||||
|
%r$1_2 = call <4 x $2> $3(<4 x $2> %$1_2a, <4 x $2> %$1_2b)
|
||||||
|
|
||||||
|
%$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||||
|
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||||
|
%$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef,
|
||||||
|
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||||
|
%r$1_3 = call <4 x $2> $3(<4 x $2> %$1_3a, <4 x $2> %$1_3b)
|
||||||
|
|
||||||
|
%r$1_01 = shufflevector <4 x $2> %r$1_0, <4 x $2> %r$1_1,
|
||||||
|
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%r$1_23 = shufflevector <4 x $2> %r$1_2, <4 x $2> %r$1_3,
|
||||||
|
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
|
|
||||||
|
%$1 = shufflevector <8 x $2> %r$1_01, <8 x $2> %r$1_23,
|
||||||
|
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||||
|
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||||
|
')
|
||||||
|
|
||||||
;; Maps a 2-wide unary function to an 8-wide vector operand, returning an
|
;; Maps a 2-wide unary function to an 8-wide vector operand, returning an
|
||||||
;; 8-wide vector result
|
;; 8-wide vector result
|
||||||
@@ -306,6 +422,20 @@ ret <8 x float> %ret
|
|||||||
'
|
'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
define(`round8to16', `
|
||||||
|
%v0 = shufflevector <16 x float> $1, <16 x float> undef,
|
||||||
|
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%v1 = shufflevector <16 x float> $1, <16 x float> undef,
|
||||||
|
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||||
|
%r0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v0, i32 $2)
|
||||||
|
%r1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v1, i32 $2)
|
||||||
|
%ret = shufflevector <8 x float> %r0, <8 x float> %r1,
|
||||||
|
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||||
|
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||||
|
ret <16 x float> %ret
|
||||||
|
'
|
||||||
|
)
|
||||||
|
|
||||||
define(`round4to8double', `
|
define(`round4to8double', `
|
||||||
%v0 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
%v0 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||||
%v1 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
%v1 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||||
@@ -349,6 +479,30 @@ ret <8 x double> %ret
|
|||||||
'
|
'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
define(`round4to16double', `
|
||||||
|
%v0 = shufflevector <16 x double> $1, <16 x double> undef,
|
||||||
|
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||||
|
%v1 = shufflevector <16 x double> $1, <16 x double> undef,
|
||||||
|
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%v2 = shufflevector <16 x double> $1, <16 x double> undef,
|
||||||
|
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||||
|
%v3 = shufflevector <16 x double> $1, <16 x double> undef,
|
||||||
|
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||||
|
%r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2)
|
||||||
|
%r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2)
|
||||||
|
%r2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v2, i32 $2)
|
||||||
|
%r3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v3, i32 $2)
|
||||||
|
%ret0 = shufflevector <4 x double> %r0, <4 x double> %r1,
|
||||||
|
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%ret1 = shufflevector <4 x double> %r2, <4 x double> %r3,
|
||||||
|
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%ret = shufflevector <8 x double> %ret0, <8 x double> %ret1,
|
||||||
|
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||||
|
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||||
|
ret <16 x double> %ret
|
||||||
|
'
|
||||||
|
)
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; forloop macro
|
;; forloop macro
|
||||||
|
|
||||||
@@ -468,12 +622,91 @@ forloop(i, 1, eval($1-1), `
|
|||||||
}
|
}
|
||||||
')
|
')
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; global_atomic
|
;; global_atomic_associative
|
||||||
|
;; More efficient implementation for atomics that are associative (e.g.,
|
||||||
|
;; add, and, ...). If a basic implementation would do something like:
|
||||||
|
;; result0 = atomic_op(ptr, val0)
|
||||||
|
;; result1 = atomic_op(ptr, val1)
|
||||||
|
;; ..
|
||||||
|
;; Then instead we can do:
|
||||||
|
;; tmp = (val0 op val1 op ...)
|
||||||
|
;; result0 = atomic_op(ptr, tmp)
|
||||||
|
;; result1 = (result0 op val0)
|
||||||
|
;; ..
|
||||||
|
;; And more efficiently compute the same result
|
||||||
|
;;
|
||||||
|
;; Takes five parameters:
|
||||||
|
;; $1: vector width of the target
|
||||||
|
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
|
||||||
|
;; (add, sub...)
|
||||||
|
;; $3: return type of the LLVM atomic (e.g. i32)
|
||||||
|
;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
|
||||||
|
;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
|
||||||
|
|
||||||
|
define(`global_atomic_associative', `
|
||||||
|
|
||||||
|
define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
|
||||||
|
<$1 x i32> %m) nounwind alwaysinline {
|
||||||
|
; first, for any lanes where the mask is off, compute a vector where those lanes
|
||||||
|
; hold the identity value..
|
||||||
|
|
||||||
|
; for the bit tricks below, we need the mask to be sign extended to be
|
||||||
|
; the size of the element type.
|
||||||
|
ifelse($3, `i64', `%mask = sext <$1 x i32> %m to <$1 x i64>')
|
||||||
|
ifelse($3, `i32', `
|
||||||
|
; silly workaround to do %mask = %m, which is not possible directly..
|
||||||
|
%maskmem = alloca <$1 x i32>
|
||||||
|
store <$1 x i32> %m, <$1 x i32> * %maskmem
|
||||||
|
%mask = load <$1 x i32> * %maskmem'
|
||||||
|
)
|
||||||
|
; zero out any lanes that are off
|
||||||
|
%valoff = and <$1 x $3> %val, %mask
|
||||||
|
|
||||||
|
; compute an identity vector that is zero in on lanes and has the identity value
|
||||||
|
; in the off lanes
|
||||||
|
%idv1 = bitcast $3 $5 to <1 x $3>
|
||||||
|
%idvec = shufflevector <1 x $3> %idv1, <1 x $3> undef,
|
||||||
|
<$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
|
||||||
|
%notmask = xor <$1 x $3> %mask, < forloop(i, 1, eval($1-1), `$3 -1, ') $3 -1 >
|
||||||
|
%idoff = and <$1 x $3> %idvec, %notmask
|
||||||
|
|
||||||
|
; and compute the merged vector that holds the identity in the off lanes
|
||||||
|
%valp = or <$1 x $3> %valoff, %idoff
|
||||||
|
|
||||||
|
; now compute the local reduction (val0 op val1 op ... )--initialize
|
||||||
|
; %eltvec so that the 0th element is the identity, the first is val0,
|
||||||
|
; the second is (val0 op val1), ..
|
||||||
|
%red0 = extractelement <$1 x $3> %valp, i32 0
|
||||||
|
%eltvec0 = insertelement <$1 x $3> undef, $3 $5, i32 0
|
||||||
|
|
||||||
|
forloop(i, 1, eval($1-1), `
|
||||||
|
%elt`'i = extractelement <$1 x $3> %valp, i32 i
|
||||||
|
%red`'i = $2 $3 %red`'eval(i-1), %elt`'i
|
||||||
|
%eltvec`'i = insertelement <$1 x $3> %eltvec`'eval(i-1), $3 %red`'eval(i-1), i32 i')
|
||||||
|
|
||||||
|
; make the atomic call, passing it the final reduced value
|
||||||
|
%final0 = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %red`'eval($1-1))
|
||||||
|
|
||||||
|
; now go back and compute the values to be returned for each program
|
||||||
|
; instance--this just involves smearing the old value returned from the
|
||||||
|
; actual atomic call across the vector and applying the vector op to the
|
||||||
|
; %eltvec vector computed above..
|
||||||
|
%finalv1 = bitcast $3 %final0 to <1 x $3>
|
||||||
|
%final_base = shufflevector <1 x $3> %finalv1, <1 x $3> undef,
|
||||||
|
<$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
|
||||||
|
%r = $2 <$1 x $3> %final_base, %eltvec`'eval($1-1)
|
||||||
|
|
||||||
|
ret <$1 x $3> %r
|
||||||
|
}
|
||||||
|
')
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; global_atomic_uniform
|
||||||
;; Defines the implementation of a function that handles the mapping from
|
;; Defines the implementation of a function that handles the mapping from
|
||||||
;; an ispc atomic function to the underlying LLVM intrinsics. Specifically,
|
;; an ispc atomic function to the underlying LLVM intrinsics. This variant
|
||||||
;; the function handles loooping over the active lanes, calling the underlying
|
;; just calls the atomic once, for the given uniform value
|
||||||
;; scalar atomic intrinsic for each one, and assembling the vector result.
|
|
||||||
;;
|
;;
|
||||||
;; Takes four parameters:
|
;; Takes four parameters:
|
||||||
;; $1: vector width of the target
|
;; $1: vector width of the target
|
||||||
@@ -482,23 +715,14 @@ forloop(i, 1, eval($1-1), `
|
|||||||
;; $3: return type of the LLVM atomic (e.g. i32)
|
;; $3: return type of the LLVM atomic (e.g. i32)
|
||||||
;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
|
;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
|
||||||
|
|
||||||
define(`global_atomic', `
|
define(`global_atomic_uniform', `
|
||||||
|
|
||||||
declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
|
declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
|
||||||
|
|
||||||
define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
|
define internal $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
|
||||||
<$1 x i32> %mask) nounwind alwaysinline {
|
<$1 x i32> %mask) nounwind alwaysinline {
|
||||||
%rptr = alloca <$1 x $3>
|
%r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
|
||||||
%rptr32 = bitcast <$1 x $3> * %rptr to $3 *
|
ret $3 %r
|
||||||
|
|
||||||
per_lane($1, <$1 x i32> %mask, `
|
|
||||||
%v_LANE_ID = extractelement <$1 x $3> %val, i32 LANE
|
|
||||||
%r_LANE_ID = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %v_LANE_ID)
|
|
||||||
%rp_LANE_ID = getelementptr $3 * %rptr32, i32 LANE
|
|
||||||
store $3 %r_LANE_ID, $3 * %rp_LANE_ID')
|
|
||||||
|
|
||||||
%r = load <$1 x $3> * %rptr
|
|
||||||
ret <$1 x $3> %r
|
|
||||||
}
|
}
|
||||||
')
|
')
|
||||||
|
|
||||||
@@ -508,9 +732,10 @@ define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
|
|||||||
;; $2: llvm type of the vector elements (e.g. i32)
|
;; $2: llvm type of the vector elements (e.g. i32)
|
||||||
;; $3: ispc type of the elements (e.g. int32)
|
;; $3: ispc type of the elements (e.g. int32)
|
||||||
|
|
||||||
define(`global_swap', `
|
declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val)
|
||||||
|
declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)
|
||||||
|
|
||||||
declare $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)
|
define(`global_swap', `
|
||||||
|
|
||||||
define internal <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
|
define internal <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
|
||||||
<$1 x i32> %mask) nounwind alwaysinline {
|
<$1 x i32> %mask) nounwind alwaysinline {
|
||||||
@@ -526,6 +751,12 @@ define internal <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
|
|||||||
%r = load <$1 x $2> * %rptr
|
%r = load <$1 x $2> * %rptr
|
||||||
ret <$1 x $2> %r
|
ret <$1 x $2> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val,
|
||||||
|
<$1 x i32> %mask) nounwind alwaysinline {
|
||||||
|
%r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)
|
||||||
|
ret $2 %r
|
||||||
|
}
|
||||||
')
|
')
|
||||||
|
|
||||||
|
|
||||||
@@ -555,6 +786,12 @@ define internal <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $
|
|||||||
%r = load <$1 x $2> * %rptr
|
%r = load <$1 x $2> * %rptr
|
||||||
ret <$1 x $2> %r
|
ret <$1 x $2> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
|
||||||
|
$2 %val, <$1 x i32> %mask) nounwind alwaysinline {
|
||||||
|
%r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)
|
||||||
|
ret $2 %r
|
||||||
|
}
|
||||||
')
|
')
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
@@ -595,10 +832,11 @@ define internal void @__prefetch_read_nt_$1($2 *) alwaysinline {
|
|||||||
|
|
||||||
define(`stdlib_core', `
|
define(`stdlib_core', `
|
||||||
|
|
||||||
declare i8* @ISPCMalloc(i64, i32) nounwind
|
declare i32 @__fast_masked_vload()
|
||||||
declare i8* @ISPCFree(i8*) nounwind
|
|
||||||
declare void @ISPCLaunch(i8*, i8*) nounwind
|
declare i8* @ISPCAlloc(i8**, i64, i32) nounwind
|
||||||
declare void @ISPCSync() nounwind
|
declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind
|
||||||
|
declare void @ISPCSync(i8*) nounwind
|
||||||
declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind
|
declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind
|
||||||
|
|
||||||
declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
|
declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
|
||||||
@@ -965,25 +1203,35 @@ define internal void @__memory_barrier() nounwind readnone alwaysinline {
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
global_atomic($1, add, i32, int32)
|
global_atomic_associative($1, add, i32, int32, 0)
|
||||||
global_atomic($1, sub, i32, int32)
|
global_atomic_associative($1, sub, i32, int32, 0)
|
||||||
global_atomic($1, and, i32, int32)
|
global_atomic_associative($1, and, i32, int32, -1)
|
||||||
global_atomic($1, or, i32, int32)
|
global_atomic_associative($1, or, i32, int32, 0)
|
||||||
global_atomic($1, xor, i32, int32)
|
global_atomic_associative($1, xor, i32, int32, 0)
|
||||||
global_atomic($1, min, i32, int32)
|
global_atomic_uniform($1, add, i32, int32)
|
||||||
global_atomic($1, max, i32, int32)
|
global_atomic_uniform($1, sub, i32, int32)
|
||||||
global_atomic($1, umin, i32, uint32)
|
global_atomic_uniform($1, and, i32, int32)
|
||||||
global_atomic($1, umax, i32, uint32)
|
global_atomic_uniform($1, or, i32, int32)
|
||||||
|
global_atomic_uniform($1, xor, i32, int32)
|
||||||
|
global_atomic_uniform($1, min, i32, int32)
|
||||||
|
global_atomic_uniform($1, max, i32, int32)
|
||||||
|
global_atomic_uniform($1, umin, i32, uint32)
|
||||||
|
global_atomic_uniform($1, umax, i32, uint32)
|
||||||
|
|
||||||
global_atomic($1, add, i64, int64)
|
global_atomic_associative($1, add, i64, int64, 0)
|
||||||
global_atomic($1, sub, i64, int64)
|
global_atomic_associative($1, sub, i64, int64, 0)
|
||||||
global_atomic($1, and, i64, int64)
|
global_atomic_associative($1, and, i64, int64, -1)
|
||||||
global_atomic($1, or, i64, int64)
|
global_atomic_associative($1, or, i64, int64, 0)
|
||||||
global_atomic($1, xor, i64, int64)
|
global_atomic_associative($1, xor, i64, int64, 0)
|
||||||
global_atomic($1, min, i64, int64)
|
global_atomic_uniform($1, add, i64, int64)
|
||||||
global_atomic($1, max, i64, int64)
|
global_atomic_uniform($1, sub, i64, int64)
|
||||||
global_atomic($1, umin, i64, uint64)
|
global_atomic_uniform($1, and, i64, int64)
|
||||||
global_atomic($1, umax, i64, uint64)
|
global_atomic_uniform($1, or, i64, int64)
|
||||||
|
global_atomic_uniform($1, xor, i64, int64)
|
||||||
|
global_atomic_uniform($1, min, i64, int64)
|
||||||
|
global_atomic_uniform($1, max, i64, int64)
|
||||||
|
global_atomic_uniform($1, umin, i64, uint64)
|
||||||
|
global_atomic_uniform($1, umax, i64, uint64)
|
||||||
|
|
||||||
global_swap($1, i32, int32)
|
global_swap($1, i32, int32)
|
||||||
global_swap($1, i64, int64)
|
global_swap($1, i64, int64)
|
||||||
@@ -1006,6 +1254,24 @@ define internal <$1 x double> @__atomic_swap_double_global(double * %ptr, <$1 x
|
|||||||
ret <$1 x double> %ret
|
ret <$1 x double> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal float @__atomic_swap_uniform_float_global(float * %ptr, float %val,
|
||||||
|
<$1 x i32> %mask) nounwind alwaysinline {
|
||||||
|
%iptr = bitcast float * %ptr to i32 *
|
||||||
|
%ival = bitcast float %val to i32
|
||||||
|
%iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <$1 x i32> %mask)
|
||||||
|
%ret = bitcast i32 %iret to float
|
||||||
|
ret float %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal double @__atomic_swap_uniform_double_global(double * %ptr, double %val,
|
||||||
|
<$1 x i32> %mask) nounwind alwaysinline {
|
||||||
|
%iptr = bitcast double * %ptr to i64 *
|
||||||
|
%ival = bitcast double %val to i64
|
||||||
|
%iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <$1 x i32> %mask)
|
||||||
|
%ret = bitcast i64 %iret to double
|
||||||
|
ret double %ret
|
||||||
|
}
|
||||||
|
|
||||||
global_atomic_exchange($1, i32, int32)
|
global_atomic_exchange($1, i32, int32)
|
||||||
global_atomic_exchange($1, i64, int64)
|
global_atomic_exchange($1, i64, int64)
|
||||||
|
|
||||||
@@ -1030,6 +1296,29 @@ define internal <$1 x double> @__atomic_compare_exchange_double_global(double *
|
|||||||
%ret = bitcast <$1 x i64> %iret to <$1 x double>
|
%ret = bitcast <$1 x i64> %iret to <$1 x double>
|
||||||
ret <$1 x double> %ret
|
ret <$1 x double> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val,
|
||||||
|
<$1 x i32> %mask) nounwind alwaysinline {
|
||||||
|
%iptr = bitcast float * %ptr to i32 *
|
||||||
|
%icmp = bitcast float %cmp to i32
|
||||||
|
%ival = bitcast float %val to i32
|
||||||
|
%iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
|
||||||
|
i32 %ival, <$1 x i32> %mask)
|
||||||
|
%ret = bitcast i32 %iret to float
|
||||||
|
ret float %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
|
||||||
|
double %val, <$1 x i32> %mask) nounwind alwaysinline {
|
||||||
|
%iptr = bitcast double * %ptr to i64 *
|
||||||
|
%icmp = bitcast double %cmp to i64
|
||||||
|
%ival = bitcast double %val to i64
|
||||||
|
%iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp,
|
||||||
|
i64 %ival, <$1 x i32> %mask)
|
||||||
|
%ret = bitcast i64 %iret to double
|
||||||
|
ret double %ret
|
||||||
|
}
|
||||||
|
|
||||||
')
|
')
|
||||||
|
|
||||||
|
|
||||||
@@ -1088,12 +1377,6 @@ i64minmax($1,max,uint64,ugt)
|
|||||||
|
|
||||||
define(`load_and_broadcast', `
|
define(`load_and_broadcast', `
|
||||||
define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
|
define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
|
||||||
; must not load if the mask is all off; the address may be invalid
|
|
||||||
%mm = call i32 @__movmsk(<$1 x i32> %mask)
|
|
||||||
%any_on = icmp ne i32 %mm, 0
|
|
||||||
br i1 %any_on, label %load, label %skip
|
|
||||||
|
|
||||||
load:
|
|
||||||
%ptr = bitcast i8 * %0 to $2 *
|
%ptr = bitcast i8 * %0 to $2 *
|
||||||
%val = load $2 * %ptr
|
%val = load $2 * %ptr
|
||||||
|
|
||||||
@@ -1101,9 +1384,6 @@ load:
|
|||||||
forloop(i, 1, eval($1-1), `
|
forloop(i, 1, eval($1-1), `
|
||||||
%ret`'i = insertelement <$1 x $2> %ret`'eval(i-1), $2 %val, i32 i')
|
%ret`'i = insertelement <$1 x $2> %ret`'eval(i-1), $2 %val, i32 i')
|
||||||
ret <$1 x $2> %ret`'eval($1-1)
|
ret <$1 x $2> %ret`'eval($1-1)
|
||||||
|
|
||||||
skip:
|
|
||||||
ret <$1 x $2> undef
|
|
||||||
}
|
}
|
||||||
')
|
')
|
||||||
|
|
||||||
@@ -1119,14 +1399,20 @@ define(`load_masked', `
|
|||||||
define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
|
define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
|
||||||
entry:
|
entry:
|
||||||
%mm = call i32 @__movmsk(<$1 x i32> %mask)
|
%mm = call i32 @__movmsk(<$1 x i32> %mask)
|
||||||
|
|
||||||
; if the first lane and the last lane are on, then it is safe to do a vector load
|
; if the first lane and the last lane are on, then it is safe to do a vector load
|
||||||
; of the whole thing--what the lanes in the middle want turns out to not matter...
|
; of the whole thing--what the lanes in the middle want turns out to not matter...
|
||||||
%mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
|
%mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
|
||||||
%can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
|
%can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
|
||||||
|
|
||||||
|
%fast32 = call i32 @__fast_masked_vload()
|
||||||
|
%fast_i1 = trunc i32 %fast32 to i1
|
||||||
|
%can_vload_maybe_fast = or i1 %fast_i1, %can_vload
|
||||||
|
|
||||||
; if we are not able to do a singe vload, we will accumulate lanes in this memory..
|
; if we are not able to do a singe vload, we will accumulate lanes in this memory..
|
||||||
%retptr = alloca <$1 x $2>
|
%retptr = alloca <$1 x $2>
|
||||||
%retptr32 = bitcast <$1 x $2> * %retptr to $2 *
|
%retptr32 = bitcast <$1 x $2> * %retptr to $2 *
|
||||||
br i1 %can_vload, label %load, label %loop
|
br i1 %can_vload_maybe_fast, label %load, label %loop
|
||||||
|
|
||||||
load:
|
load:
|
||||||
%ptr = bitcast i8 * %0 to <$1 x $2> *
|
%ptr = bitcast i8 * %0 to <$1 x $2> *
|
||||||
@@ -1261,6 +1547,46 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
|
|||||||
')
|
')
|
||||||
|
|
||||||
|
|
||||||
|
define(`masked_store_blend_8_16_by_16', `
|
||||||
|
define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>,
|
||||||
|
<16 x i32>) nounwind alwaysinline {
|
||||||
|
%old = load <16 x i8> * %0
|
||||||
|
%old128 = bitcast <16 x i8> %old to i128
|
||||||
|
%new128 = bitcast <16 x i8> %1 to i128
|
||||||
|
|
||||||
|
%mask8 = trunc <16 x i32> %2 to <16 x i8>
|
||||||
|
%mask128 = bitcast <16 x i8> %mask8 to i128
|
||||||
|
%notmask128 = xor i128 %mask128, -1
|
||||||
|
|
||||||
|
%newmasked = and i128 %new128, %mask128
|
||||||
|
%oldmasked = and i128 %old128, %notmask128
|
||||||
|
%result = or i128 %newmasked, %oldmasked
|
||||||
|
|
||||||
|
%resultvec = bitcast i128 %result to <16 x i8>
|
||||||
|
store <16 x i8> %resultvec, <16 x i8> * %0
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
|
||||||
|
<16 x i32>) nounwind alwaysinline {
|
||||||
|
%old = load <16 x i16> * %0
|
||||||
|
%old256 = bitcast <16 x i16> %old to i256
|
||||||
|
%new256 = bitcast <16 x i16> %1 to i256
|
||||||
|
|
||||||
|
%mask16 = trunc <16 x i32> %2 to <16 x i16>
|
||||||
|
%mask256 = bitcast <16 x i16> %mask16 to i256
|
||||||
|
%notmask256 = xor i256 %mask256, -1
|
||||||
|
|
||||||
|
%newmasked = and i256 %new256, %mask256
|
||||||
|
%oldmasked = and i256 %old256, %notmask256
|
||||||
|
%result = or i256 %newmasked, %oldmasked
|
||||||
|
|
||||||
|
%resultvec = bitcast i256 %result to <16 x i16>
|
||||||
|
store <16 x i16> %resultvec, <16 x i16> * %0
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
')
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; packed load and store functions
|
;; packed load and store functions
|
||||||
;;
|
;;
|
||||||
@@ -1288,7 +1614,7 @@ entry:
|
|||||||
|
|
||||||
known_mask:
|
known_mask:
|
||||||
%allon = icmp eq i32 %mask, eval((1 << $1) -1)
|
%allon = icmp eq i32 %mask, eval((1 << $1) -1)
|
||||||
br i1 %allon, label %all_on, label %not_all_on
|
br i1 %allon, label %all_on, label %unknown_mask
|
||||||
|
|
||||||
all_on:
|
all_on:
|
||||||
;; everyone wants to load, so just load an entire vector width in a single
|
;; everyone wants to load, so just load an entire vector width in a single
|
||||||
@@ -1298,14 +1624,6 @@ all_on:
|
|||||||
store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
|
store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
|
||||||
ret i32 $1
|
ret i32 $1
|
||||||
|
|
||||||
not_all_on:
|
|
||||||
%alloff = icmp eq i32 %mask, 0
|
|
||||||
br i1 %alloff, label %all_off, label %unknown_mask
|
|
||||||
|
|
||||||
all_off:
|
|
||||||
;; no one wants to load
|
|
||||||
ret i32 0
|
|
||||||
|
|
||||||
unknown_mask:
|
unknown_mask:
|
||||||
br label %loop
|
br label %loop
|
||||||
|
|
||||||
@@ -1352,20 +1670,13 @@ entry:
|
|||||||
|
|
||||||
known_mask:
|
known_mask:
|
||||||
%allon = icmp eq i32 %mask, eval((1 << $1) -1)
|
%allon = icmp eq i32 %mask, eval((1 << $1) -1)
|
||||||
br i1 %allon, label %all_on, label %not_all_on
|
br i1 %allon, label %all_on, label %unknown_mask
|
||||||
|
|
||||||
all_on:
|
all_on:
|
||||||
%vecptr = bitcast i32 *%startptr to <$1 x i32> *
|
%vecptr = bitcast i32 *%startptr to <$1 x i32> *
|
||||||
store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4
|
store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4
|
||||||
ret i32 $1
|
ret i32 $1
|
||||||
|
|
||||||
not_all_on:
|
|
||||||
%alloff = icmp eq i32 %mask, 0
|
|
||||||
br i1 %alloff, label %all_off, label %unknown_mask
|
|
||||||
|
|
||||||
all_off:
|
|
||||||
ret i32 0
|
|
||||||
|
|
||||||
unknown_mask:
|
unknown_mask:
|
||||||
br label %loop
|
br label %loop
|
||||||
|
|
||||||
@@ -1415,14 +1726,6 @@ entry:
|
|||||||
br i1 %allon, label %check_neighbors, label %domixed
|
br i1 %allon, label %check_neighbors, label %domixed
|
||||||
|
|
||||||
domixed:
|
domixed:
|
||||||
; the mask is mixed on/off. First see if the lanes are all off
|
|
||||||
%alloff = icmp eq i32 %mm, 0
|
|
||||||
br i1 %alloff, label %doalloff, label %actuallymixed
|
|
||||||
|
|
||||||
doalloff:
|
|
||||||
ret i1 false ;; this seems safest
|
|
||||||
|
|
||||||
actuallymixed:
|
|
||||||
; First, figure out which lane is the first active one
|
; First, figure out which lane is the first active one
|
||||||
%first = call i32 @llvm.cttz.i32(i32 %mm)
|
%first = call i32 @llvm.cttz.i32(i32 %mm)
|
||||||
%baseval = extractelement <$1 x $2> %v, i32 %first
|
%baseval = extractelement <$1 x $2> %v, i32 %first
|
||||||
@@ -1445,7 +1748,7 @@ actuallymixed:
|
|||||||
br label %check_neighbors
|
br label %check_neighbors
|
||||||
|
|
||||||
check_neighbors:
|
check_neighbors:
|
||||||
%vec = phi <$1 x $2> [ %blendvec, %actuallymixed ], [ %v, %entry ]
|
%vec = phi <$1 x $2> [ %blendvec, %domixed ], [ %v, %entry ]
|
||||||
ifelse($6, `32', `
|
ifelse($6, `32', `
|
||||||
; For 32-bit elements, we rotate once and compare with the vector, which ends
|
; For 32-bit elements, we rotate once and compare with the vector, which ends
|
||||||
; up comparing each element to its neighbor on the right. Then see if
|
; up comparing each element to its neighbor on the right. Then see if
|
||||||
@@ -1577,7 +1880,7 @@ pl_known_mask:
|
|||||||
;; the mask is known at compile time; see if it is something we can
|
;; the mask is known at compile time; see if it is something we can
|
||||||
;; handle more efficiently
|
;; handle more efficiently
|
||||||
%pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
|
%pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
|
||||||
br i1 %pl_is_allon, label %pl_all_on, label %pl_not_all_on
|
br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask
|
||||||
|
|
||||||
pl_all_on:
|
pl_all_on:
|
||||||
;; the mask is all on--just expand the code for each lane sequentially
|
;; the mask is all on--just expand the code for each lane sequentially
|
||||||
@@ -1585,19 +1888,14 @@ pl_all_on:
|
|||||||
`patsubst(`$3', `ID\|LANE', i)')
|
`patsubst(`$3', `ID\|LANE', i)')
|
||||||
br label %pl_done
|
br label %pl_done
|
||||||
|
|
||||||
pl_not_all_on:
|
pl_unknown_mask:
|
||||||
;; not all on--see if it is all off or mixed
|
;; we just run the general case, though we could
|
||||||
;; for the mixed case, we just run the general case, though we could
|
|
||||||
;; try to be smart and just emit the code based on what it actually is,
|
;; try to be smart and just emit the code based on what it actually is,
|
||||||
;; for example by emitting the code straight-line without a loop and doing
|
;; for example by emitting the code straight-line without a loop and doing
|
||||||
;; the lane tests explicitly, leaving later optimization passes to eliminate
|
;; the lane tests explicitly, leaving later optimization passes to eliminate
|
||||||
;; the stuff that is definitely not needed. Not clear if we will frequently
|
;; the stuff that is definitely not needed. Not clear if we will frequently
|
||||||
;; encounter a mask that is known at compile-time but is not either all on or
|
;; encounter a mask that is known at compile-time but is not either all on or
|
||||||
;; all off...
|
;; all off...
|
||||||
%pl_alloff = icmp eq i32 %pl_mask, 0
|
|
||||||
br i1 %pl_alloff, label %pl_done, label %pl_unknown_mask
|
|
||||||
|
|
||||||
pl_unknown_mask:
|
|
||||||
br label %pl_loop
|
br label %pl_loop
|
||||||
|
|
||||||
pl_loop:
|
pl_loop:
|
||||||
@@ -1653,20 +1951,6 @@ define internal <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x
|
|||||||
|
|
||||||
define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,
|
define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,
|
||||||
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
|
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||||
entry:
|
|
||||||
%mask = call i32 @__movmsk(<$1 x i32> %vecmask)
|
|
||||||
|
|
||||||
%maskKnown = call i1 @__is_compile_time_constant_mask(<$1 x i32> %vecmask)
|
|
||||||
br i1 %maskKnown, label %known_mask, label %unknown_mask
|
|
||||||
|
|
||||||
known_mask:
|
|
||||||
%alloff = icmp eq i32 %mask, 0
|
|
||||||
br i1 %alloff, label %gather_all_off, label %unknown_mask
|
|
||||||
|
|
||||||
gather_all_off:
|
|
||||||
ret <$1 x $2> undef
|
|
||||||
|
|
||||||
unknown_mask:
|
|
||||||
; We can be clever and avoid the per-lane stuff for gathers if we are willing
|
; We can be clever and avoid the per-lane stuff for gathers if we are willing
|
||||||
; to require that the 0th element of the array being gathered from is always
|
; to require that the 0th element of the array being gathered from is always
|
||||||
; legal to read from (and we do indeed require that, given the benefits!)
|
; legal to read from (and we do indeed require that, given the benefits!)
|
||||||
|
|||||||
132
ctx.cpp
132
ctx.cpp
@@ -144,6 +144,11 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
|||||||
returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
|
returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
|
||||||
StoreInst(LLVMMaskAllOff, returnedLanesPtr);
|
StoreInst(LLVMMaskAllOff, returnedLanesPtr);
|
||||||
|
|
||||||
|
launchedTasks = false;
|
||||||
|
launchGroupHandlePtr = AllocaInst(LLVMTypes::VoidPointerType, "launch_group_handle");
|
||||||
|
StoreInst(llvm::Constant::getNullValue(LLVMTypes::VoidPointerType),
|
||||||
|
launchGroupHandlePtr);
|
||||||
|
|
||||||
if (!returnType || returnType == AtomicType::Void)
|
if (!returnType || returnType == AtomicType::Void)
|
||||||
returnValuePtr = NULL;
|
returnValuePtr = NULL;
|
||||||
else {
|
else {
|
||||||
@@ -153,7 +158,6 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
|||||||
StoreInst(llvm::Constant::getNullValue(ftype), returnValuePtr);
|
StoreInst(llvm::Constant::getNullValue(ftype), returnValuePtr);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef LLVM_2_8
|
|
||||||
if (m->diBuilder) {
|
if (m->diBuilder) {
|
||||||
/* If debugging is enabled, tell the debug information emission
|
/* If debugging is enabled, tell the debug information emission
|
||||||
code about this new function */
|
code about this new function */
|
||||||
@@ -174,16 +178,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
|||||||
/* And start a scope representing the initial function scope */
|
/* And start a scope representing the initial function scope */
|
||||||
StartScope();
|
StartScope();
|
||||||
}
|
}
|
||||||
#endif // LLVM_2_8
|
|
||||||
|
|
||||||
launchedTasks = false;
|
|
||||||
|
|
||||||
// connect the funciton's mask memory to the __mask symbol
|
// connect the funciton's mask memory to the __mask symbol
|
||||||
Symbol *maskSymbol = m->symbolTable->LookupVariable("__mask");
|
Symbol *maskSymbol = m->symbolTable->LookupVariable("__mask");
|
||||||
assert(maskSymbol != NULL);
|
assert(maskSymbol != NULL);
|
||||||
maskSymbol->storagePtr = maskPtr;
|
maskSymbol->storagePtr = maskPtr;
|
||||||
|
|
||||||
#ifndef LLVM_2_8
|
|
||||||
// add debugging info for __mask, programIndex, ...
|
// add debugging info for __mask, programIndex, ...
|
||||||
if (m->diBuilder) {
|
if (m->diBuilder) {
|
||||||
maskSymbol->pos = funcStartPos;
|
maskSymbol->pos = funcStartPos;
|
||||||
@@ -208,15 +208,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
|||||||
true /* static */,
|
true /* static */,
|
||||||
programCountSymbol->storagePtr);
|
programCountSymbol->storagePtr);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
FunctionEmitContext::~FunctionEmitContext() {
|
FunctionEmitContext::~FunctionEmitContext() {
|
||||||
assert(controlFlowInfo.size() == 0);
|
assert(controlFlowInfo.size() == 0);
|
||||||
#ifndef LLVM_2_8
|
|
||||||
assert(debugScopes.size() == (m->diBuilder ? 1 : 0));
|
assert(debugScopes.size() == (m->diBuilder ? 1 : 0));
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -704,6 +701,7 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
|
|||||||
|
|
||||||
llvm::Value *
|
llvm::Value *
|
||||||
FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
||||||
|
#if 0
|
||||||
// Compare the two masks to get a vector of i1s
|
// Compare the two masks to get a vector of i1s
|
||||||
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
||||||
v1, v2, "v1==v2");
|
v1, v2, "v1==v2");
|
||||||
@@ -711,6 +709,12 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
|||||||
cmp = I1VecToBoolVec(cmp);
|
cmp = I1VecToBoolVec(cmp);
|
||||||
// And see if it's all on
|
// And see if it's all on
|
||||||
return All(cmp);
|
return All(cmp);
|
||||||
|
#else
|
||||||
|
llvm::Value *mm1 = LaneMask(v1);
|
||||||
|
llvm::Value *mm2 = LaneMask(v2);
|
||||||
|
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
|
||||||
|
"v1==v2");
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -758,7 +762,7 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
|
|||||||
|
|
||||||
|
|
||||||
llvm::Value *
|
llvm::Value *
|
||||||
FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) {
|
FunctionEmitContext::SizeOf(LLVM_TYPE_CONST llvm::Type *ty) {
|
||||||
// Emit code to compute the size of the given type using a GEP with a
|
// Emit code to compute the size of the given type using a GEP with a
|
||||||
// NULL base pointer, indexing one element of the given type, and
|
// NULL base pointer, indexing one element of the given type, and
|
||||||
// casting the resulting 'pointer' to an int giving its size.
|
// casting the resulting 'pointer' to an int giving its size.
|
||||||
@@ -775,24 +779,7 @@ FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) {
|
|||||||
#endif
|
#endif
|
||||||
AddDebugPos(poffset);
|
AddDebugPos(poffset);
|
||||||
llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int");
|
llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int");
|
||||||
|
return sizeOf;
|
||||||
// And given the size, call the malloc function
|
|
||||||
llvm::Function *fmalloc = m->module->getFunction("ISPCMalloc");
|
|
||||||
assert(fmalloc != NULL);
|
|
||||||
llvm::Value *mem = CallInst(fmalloc, sizeOf, LLVMInt32(align),
|
|
||||||
"raw_argmem");
|
|
||||||
// Cast the void * back to the result pointer type
|
|
||||||
return BitCastInst(mem, ptrType, "mem_bitcast");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void
|
|
||||||
FunctionEmitContext::EmitFree(llvm::Value *ptr) {
|
|
||||||
llvm::Value *freeArg = BitCastInst(ptr, LLVMTypes::VoidPointerType,
|
|
||||||
"argmemfree");
|
|
||||||
llvm::Function *ffree = m->module->getFunction("ISPCFree");
|
|
||||||
assert(ffree != NULL);
|
|
||||||
CallInst(ffree, freeArg);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -850,7 +837,6 @@ FunctionEmitContext::GetDebugPos() const {
|
|||||||
void
|
void
|
||||||
FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
|
FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
|
||||||
llvm::DIScope *scope) {
|
llvm::DIScope *scope) {
|
||||||
#ifndef LLVM_2_8
|
|
||||||
llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(value);
|
llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(value);
|
||||||
if (inst != NULL && m->diBuilder) {
|
if (inst != NULL && m->diBuilder) {
|
||||||
SourcePos p = pos ? *pos : currentPos;
|
SourcePos p = pos ? *pos : currentPos;
|
||||||
@@ -861,13 +847,11 @@ FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
|
|||||||
inst->setDebugLoc(llvm::DebugLoc::get(p.first_line, p.first_column,
|
inst->setDebugLoc(llvm::DebugLoc::get(p.first_line, p.first_column,
|
||||||
scope ? *scope : GetDIScope()));
|
scope ? *scope : GetDIScope()));
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
FunctionEmitContext::StartScope() {
|
FunctionEmitContext::StartScope() {
|
||||||
#ifndef LLVM_2_8
|
|
||||||
if (m->diBuilder != NULL) {
|
if (m->diBuilder != NULL) {
|
||||||
llvm::DIScope parentScope;
|
llvm::DIScope parentScope;
|
||||||
if (debugScopes.size() > 0)
|
if (debugScopes.size() > 0)
|
||||||
@@ -881,18 +865,15 @@ FunctionEmitContext::StartScope() {
|
|||||||
currentPos.first_column);
|
currentPos.first_column);
|
||||||
debugScopes.push_back(lexicalBlock);
|
debugScopes.push_back(lexicalBlock);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
FunctionEmitContext::EndScope() {
|
FunctionEmitContext::EndScope() {
|
||||||
#ifndef LLVM_2_8
|
|
||||||
if (m->diBuilder != NULL) {
|
if (m->diBuilder != NULL) {
|
||||||
assert(debugScopes.size() > 0);
|
assert(debugScopes.size() > 0);
|
||||||
debugScopes.pop_back();
|
debugScopes.pop_back();
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -905,7 +886,6 @@ FunctionEmitContext::GetDIScope() const {
|
|||||||
|
|
||||||
void
|
void
|
||||||
FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
|
FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
|
||||||
#ifndef LLVM_2_8
|
|
||||||
if (m->diBuilder == NULL)
|
if (m->diBuilder == NULL)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@@ -921,13 +901,11 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
|
|||||||
llvm::Instruction *declareInst =
|
llvm::Instruction *declareInst =
|
||||||
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
|
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
|
||||||
AddDebugPos(declareInst, &sym->pos, &scope);
|
AddDebugPos(declareInst, &sym->pos, &scope);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
|
FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
|
||||||
#ifndef LLVM_2_8
|
|
||||||
if (m->diBuilder == NULL)
|
if (m->diBuilder == NULL)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@@ -943,7 +921,6 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
|
|||||||
llvm::Instruction *declareInst =
|
llvm::Instruction *declareInst =
|
||||||
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
|
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
|
||||||
AddDebugPos(declareInst, &sym->pos, &scope);
|
AddDebugPos(declareInst, &sym->pos, &scope);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -1501,27 +1478,15 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
|
|||||||
void
|
void
|
||||||
FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {
|
FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {
|
||||||
llvm::Value *str = llvm::MDString::get(*g->ctx, pos.name);
|
llvm::Value *str = llvm::MDString::get(*g->ctx, pos.name);
|
||||||
#ifdef LLVM_2_8
|
|
||||||
llvm::MDNode *md = llvm::MDNode::get(*g->ctx, &str, 1);
|
|
||||||
#else
|
|
||||||
llvm::MDNode *md = llvm::MDNode::get(*g->ctx, str);
|
llvm::MDNode *md = llvm::MDNode::get(*g->ctx, str);
|
||||||
#endif
|
|
||||||
inst->setMetadata("filename", md);
|
inst->setMetadata("filename", md);
|
||||||
|
|
||||||
llvm::Value *line = LLVMInt32(pos.first_line);
|
llvm::Value *line = LLVMInt32(pos.first_line);
|
||||||
#ifdef LLVM_2_8
|
|
||||||
md = llvm::MDNode::get(*g->ctx, &line, 1);
|
|
||||||
#else
|
|
||||||
md = llvm::MDNode::get(*g->ctx, line);
|
md = llvm::MDNode::get(*g->ctx, line);
|
||||||
#endif
|
|
||||||
inst->setMetadata("line", md);
|
inst->setMetadata("line", md);
|
||||||
|
|
||||||
llvm::Value *column = LLVMInt32(pos.first_column);
|
llvm::Value *column = LLVMInt32(pos.first_column);
|
||||||
#ifdef LLVM_2_8
|
|
||||||
md = llvm::MDNode::get(*g->ctx, &column, 1);
|
|
||||||
#else
|
|
||||||
md = llvm::MDNode::get(*g->ctx, column);
|
md = llvm::MDNode::get(*g->ctx, column);
|
||||||
#endif
|
|
||||||
inst->setMetadata("column", md);
|
inst->setMetadata("column", md);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1838,9 +1803,9 @@ llvm::PHINode *
|
|||||||
FunctionEmitContext::PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
|
FunctionEmitContext::PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
|
||||||
const char *name) {
|
const char *name) {
|
||||||
llvm::PHINode *pn = llvm::PHINode::Create(type,
|
llvm::PHINode *pn = llvm::PHINode::Create(type,
|
||||||
#if !defined(LLVM_2_8) && !defined(LLVM_2_9)
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
count,
|
count,
|
||||||
#endif // !LLVM_2_8 && !LLVM_2_9
|
#endif // LLVM_3_0
|
||||||
name ? name : "phi", bblock);
|
name ? name : "phi", bblock);
|
||||||
AddDebugPos(pn);
|
AddDebugPos(pn);
|
||||||
return pn;
|
return pn;
|
||||||
@@ -1933,15 +1898,9 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0,
|
|||||||
|
|
||||||
llvm::Instruction *
|
llvm::Instruction *
|
||||||
FunctionEmitContext::ReturnInst() {
|
FunctionEmitContext::ReturnInst() {
|
||||||
if (launchedTasks) {
|
if (launchedTasks)
|
||||||
// Automatically add a sync call at the end of any function that
|
// Add a sync call at the end of any function that launched tasks
|
||||||
// launched tasks
|
SyncInst();
|
||||||
SourcePos noPos;
|
|
||||||
noPos.name = "__auto_sync";
|
|
||||||
ExprStmt *es = new ExprStmt(new SyncExpr(noPos), noPos);
|
|
||||||
es->EmitCode(this);
|
|
||||||
delete es;
|
|
||||||
}
|
|
||||||
|
|
||||||
llvm::Instruction *rinst = NULL;
|
llvm::Instruction *rinst = NULL;
|
||||||
if (returnValuePtr != NULL) {
|
if (returnValuePtr != NULL) {
|
||||||
@@ -1964,7 +1923,8 @@ FunctionEmitContext::ReturnInst() {
|
|||||||
|
|
||||||
llvm::Instruction *
|
llvm::Instruction *
|
||||||
FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
||||||
std::vector<llvm::Value *> &argVals) {
|
std::vector<llvm::Value *> &argVals,
|
||||||
|
llvm::Value *launchCount) {
|
||||||
if (callee == NULL) {
|
if (callee == NULL) {
|
||||||
assert(m->errorCount > 0);
|
assert(m->errorCount > 0);
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -1981,20 +1941,15 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
|||||||
static_cast<LLVM_TYPE_CONST llvm::StructType *>(pt->getElementType());
|
static_cast<LLVM_TYPE_CONST llvm::StructType *>(pt->getElementType());
|
||||||
assert(argStructType->getNumElements() == argVals.size() + 1);
|
assert(argStructType->getNumElements() == argVals.size() + 1);
|
||||||
|
|
||||||
|
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
|
||||||
|
assert(falloc != NULL);
|
||||||
int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
|
int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
|
||||||
#ifdef ISPC_IS_WINDOWS
|
std::vector<llvm::Value *> allocArgs;
|
||||||
// Use malloc() to allocate storage on Windows, since the stack is
|
allocArgs.push_back(launchGroupHandlePtr);
|
||||||
// generally not big enough there to do enough allocations for lots of
|
allocArgs.push_back(SizeOf(argStructType));
|
||||||
// tasks and then things crash horribly...
|
allocArgs.push_back(LLVMInt32(align));
|
||||||
llvm::Value *argmem = EmitMalloc(argStructType, align);
|
llvm::Value *voidmem = CallInst(falloc, allocArgs, "args_ptr");
|
||||||
#else
|
llvm::Value *argmem = BitCastInst(voidmem, pt);
|
||||||
// Use alloca for space for the task args on OSX And Linux. KEY
|
|
||||||
// DETAIL: pass false to the call of FunctionEmitContext::AllocaInst so
|
|
||||||
// that the alloca doesn't happen just once at the top of the function,
|
|
||||||
// but happens each time the enclosing basic block executes.
|
|
||||||
llvm::Value *argmem = AllocaInst(argStructType, "argmem", align, false);
|
|
||||||
#endif // ISPC_IS_WINDOWS
|
|
||||||
llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType);
|
|
||||||
|
|
||||||
// Copy the values of the parameters into the appropriate place in
|
// Copy the values of the parameters into the appropriate place in
|
||||||
// the argument block
|
// the argument block
|
||||||
@@ -2016,5 +1971,32 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
|||||||
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
|
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
|
||||||
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
|
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
|
||||||
assert(flaunch != NULL);
|
assert(flaunch != NULL);
|
||||||
return CallInst(flaunch, fptr, voidmem, "");
|
std::vector<llvm::Value *> args;
|
||||||
|
args.push_back(launchGroupHandlePtr);
|
||||||
|
args.push_back(fptr);
|
||||||
|
args.push_back(voidmem);
|
||||||
|
args.push_back(launchCount);
|
||||||
|
return CallInst(flaunch, args, "");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
FunctionEmitContext::SyncInst() {
|
||||||
|
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr, NULL);
|
||||||
|
llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
|
||||||
|
llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp,
|
||||||
|
llvm::CmpInst::ICMP_NE,
|
||||||
|
launchGroupHandle, nullPtrValue);
|
||||||
|
llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
|
||||||
|
llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
|
||||||
|
BranchInst(bSync, bPostSync, nonNull);
|
||||||
|
|
||||||
|
SetCurrentBasicBlock(bSync);
|
||||||
|
llvm::Function *fsync = m->module->getFunction("ISPCSync");
|
||||||
|
if (fsync == NULL)
|
||||||
|
FATAL("Couldn't find ISPCSync declaration?!");
|
||||||
|
CallInst(fsync, launchGroupHandle, "");
|
||||||
|
BranchInst(bPostSync);
|
||||||
|
|
||||||
|
SetCurrentBasicBlock(bPostSync);
|
||||||
}
|
}
|
||||||
|
|||||||
21
ctx.h
21
ctx.h
@@ -210,15 +210,8 @@ public:
|
|||||||
i32. */
|
i32. */
|
||||||
llvm::Value *I1VecToBoolVec(llvm::Value *b);
|
llvm::Value *I1VecToBoolVec(llvm::Value *b);
|
||||||
|
|
||||||
/** Emit code to call the user-supplied ISPCMalloc function to
|
/** Returns the size of the given type. */
|
||||||
allocate space for an object of thee given type. Returns the
|
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *ty);
|
||||||
pointer value returned by the ISPCMalloc call. */
|
|
||||||
llvm::Value *EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align = 0);
|
|
||||||
|
|
||||||
/** Emit code to call the user-supplied ISPCFree function, passing it
|
|
||||||
the given pointer to storage previously allocated by an
|
|
||||||
EmitMalloc() call. */
|
|
||||||
void EmitFree(llvm::Value *ptr);
|
|
||||||
|
|
||||||
/** If the user has asked to compile the program with instrumentation,
|
/** If the user has asked to compile the program with instrumentation,
|
||||||
this inserts a callback to the user-supplied instrumentation
|
this inserts a callback to the user-supplied instrumentation
|
||||||
@@ -399,7 +392,10 @@ public:
|
|||||||
/** Launch an asynchronous task to run the given function, passing it
|
/** Launch an asynchronous task to run the given function, passing it
|
||||||
he given argument values. */
|
he given argument values. */
|
||||||
llvm::Instruction *LaunchInst(llvm::Function *callee,
|
llvm::Instruction *LaunchInst(llvm::Function *callee,
|
||||||
std::vector<llvm::Value *> &argVals);
|
std::vector<llvm::Value *> &argVals,
|
||||||
|
llvm::Value *launchCount);
|
||||||
|
|
||||||
|
void SyncInst();
|
||||||
|
|
||||||
llvm::Instruction *ReturnInst();
|
llvm::Instruction *ReturnInst();
|
||||||
/** @} */
|
/** @} */
|
||||||
@@ -489,6 +485,11 @@ private:
|
|||||||
/** True if a 'launch' statement has been encountered in the function. */
|
/** True if a 'launch' statement has been encountered in the function. */
|
||||||
bool launchedTasks;
|
bool launchedTasks;
|
||||||
|
|
||||||
|
/** This is a pointer to a void * that is passed to the ISPCLaunch(),
|
||||||
|
ISPCAlloc(), and ISPCSync() routines as a handle to the group of
|
||||||
|
tasks launched from the current function. */
|
||||||
|
llvm::Value *launchGroupHandlePtr;
|
||||||
|
|
||||||
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
|
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
|
||||||
static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
|
static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
|
||||||
bool ifsInLoopAllUniform() const;
|
bool ifsInLoopAllUniform() const;
|
||||||
|
|||||||
2
decl.cpp
2
decl.cpp
@@ -237,7 +237,7 @@ Declarator::GetType(DeclSpecs *ds) const {
|
|||||||
sprintf(buf, "__anon_parameter_%d", i);
|
sprintf(buf, "__anon_parameter_%d", i);
|
||||||
sym = new Symbol(buf, pos);
|
sym = new Symbol(buf, pos);
|
||||||
Declarator *declarator = new Declarator(sym, sym->pos);
|
Declarator *declarator = new Declarator(sym, sym->pos);
|
||||||
sym->type = declarator->GetType(ds);
|
sym->type = declarator->GetType(d->declSpecs);
|
||||||
d->declarators.push_back(declarator);
|
d->declarators.push_back(declarator);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|||||||
@@ -1,3 +1,88 @@
|
|||||||
|
=== v1.0.10 === (30 September 2011)
|
||||||
|
|
||||||
|
This release features an extensive new example showing the application of
|
||||||
|
ispc to a deferred shading algorithm for scenes with thousands of lights
|
||||||
|
(examples/deferred). This is an implementation of the algorithm that Johan
|
||||||
|
Andersson described at SIGGRAPH 2009 and was implemented by Andrew
|
||||||
|
Lauritzen and Jefferson Montgomery. The basic idea is that a pre-rendered
|
||||||
|
G-buffer is partitioned into tiles, and in each tile, the set of lights
|
||||||
|
that contribute to the tile is computed. Then, the pixels in the tile are
|
||||||
|
shaded using those light sources. (See slides 19-29 of
|
||||||
|
http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
|
||||||
|
for more details on the algorithm.)
|
||||||
|
|
||||||
|
The mechanism for launching tasks from ispc code has been generalized to
|
||||||
|
allow multiple tasks to be launched with a single launch call (see
|
||||||
|
http://ispc.github.com/ispc.html#task-parallelism-language-syntax for more
|
||||||
|
information.)
|
||||||
|
|
||||||
|
A few new functions have been added to the standard library: num_cores()
|
||||||
|
returns the number of cores in the system's CPU, and variants of all of the
|
||||||
|
atomic operators that take 'uniform' values as parameters have been added.
|
||||||
|
|
||||||
|
=== v1.0.9 === (26 September 2011)
|
||||||
|
|
||||||
|
The binary release of v1.0.9 is the first that supports AVX code
|
||||||
|
generation. Two targets are provided: "avx", which runs with a
|
||||||
|
programCount of 8, and "avx-x2" which runs 16 program instances
|
||||||
|
simultaneously. (This binary is also built using the in-progress LLVM 3.0
|
||||||
|
development libraries, while previous ones have been built with the
|
||||||
|
released 2.9 version of LLVM.)
|
||||||
|
|
||||||
|
This release has no other significant changes beyond a number of small
|
||||||
|
bugfixes (https://github.com/ispc/ispc/issues/100,
|
||||||
|
https://github.com/ispc/ispc/issues/101, https://github.com/ispc/ispc/issues/103.)
|
||||||
|
|
||||||
|
=== v1.0.8 === (19 September 2011)
|
||||||
|
|
||||||
|
A number of improvements have been made to handling of 'if' statements in
|
||||||
|
the language:
|
||||||
|
- A bug was fixed where invalid memory could be incorrectly accessed even
|
||||||
|
if none of the running program instances wanted to execute the
|
||||||
|
corresponding instructions (https://github.com/ispc/ispc/issues/74).
|
||||||
|
- The code generated for 'if' statements is a bit simpler and thus more
|
||||||
|
efficient.
|
||||||
|
|
||||||
|
There is now a '--pic' command-line argument that causes position-independent
|
||||||
|
code to be generated (Linux and OSX only).
|
||||||
|
|
||||||
|
A number of additional performance improvements:
|
||||||
|
- Loops are now unrolled by default; the --opt=disable-loop-unroll
|
||||||
|
command-line argument can be used to disable this behavior.
|
||||||
|
(https://github.com/ispc/ispc/issues/78)
|
||||||
|
- A few more cases where gathers/scatters could be determined at compile
|
||||||
|
time to actually access contiguous locations have been added.
|
||||||
|
(https://github.com/ispc/ispc/issues/79)
|
||||||
|
|
||||||
|
Finally, warnings are now issued (if possible) when it can be determined
|
||||||
|
at compile-time that an out-of-bounds array index is being used.
|
||||||
|
(https://github.com/ispc/ispc/issues/98).
|
||||||
|
|
||||||
|
|
||||||
|
=== v1.0.7 === (3 September 2011)
|
||||||
|
|
||||||
|
The various atomic_*_global() standard library functions are generally
|
||||||
|
substantially more efficient. They all previously issued one hardware
|
||||||
|
atomic instruction for each running program instance but now locally
|
||||||
|
compute a reduction over the operands and issue a single hardware atomic,
|
||||||
|
giving the same effect and results in the end (issue #57).
|
||||||
|
|
||||||
|
CPU/ISA target handling has been substantially improved. If no CPU is
|
||||||
|
specified, the host CPU type is used, not just a default of "nehalem". A
|
||||||
|
number of bugs were fixed that ensure that LLVM doesn't generate SSE>2
|
||||||
|
instructions when using the SSE2 target (fixes issue #82).
|
||||||
|
|
||||||
|
Shift rights of unsigned integer types use a logical shift right
|
||||||
|
instruction now, not an arithmetic shift right (fixed issue #88).
|
||||||
|
|
||||||
|
When emitting header files, 'extern' declarations of globals used in ispc
|
||||||
|
code are now outside of the ispc namespace. Fixes issue #64.
|
||||||
|
|
||||||
|
The stencil example has been modified to do runs with and without
|
||||||
|
parallelism.
|
||||||
|
|
||||||
|
Many other small bugfixes and improvements.
|
||||||
|
|
||||||
=== v1.0.6 === (17 August 2011)
|
=== v1.0.6 === (17 August 2011)
|
||||||
|
|
||||||
Some additional cross-program instance operations have been added to the
|
Some additional cross-program instance operations have been added to the
|
||||||
|
|||||||
390
docs/ispc.txt
390
docs/ispc.txt
@@ -33,6 +33,17 @@ The main goals behind ``ispc`` are to:
|
|||||||
number of non-trivial workloads that aren't handled well by other
|
number of non-trivial workloads that aren't handled well by other
|
||||||
compilation approaches (e.g. loop auto-vectorization.)
|
compilation approaches (e.g. loop auto-vectorization.)
|
||||||
|
|
||||||
|
**We are very interested in your feedback and comments about ispc and
|
||||||
|
in hearing your experiences using the system. We are especially interested
|
||||||
|
in hearing if you try using ispc but see results that are not as you
|
||||||
|
were expecting or hoping for.** We encourage you to send a note with your
|
||||||
|
experiences or comments to the `ispc-users`_ mailing list or to file bug or
|
||||||
|
feature requests with the ``ispc`` `bug tracker`_. (Thanks!)
|
||||||
|
|
||||||
|
.. _ispc-users: http://groups.google.com/group/ispc-users
|
||||||
|
.. _bug tracker: https://github.com/ispc/ispc/issues?state=open
|
||||||
|
|
||||||
|
|
||||||
Contents:
|
Contents:
|
||||||
|
|
||||||
* `Recent Changes to ISPC`_
|
* `Recent Changes to ISPC`_
|
||||||
@@ -69,7 +80,8 @@ Contents:
|
|||||||
+ `Program Instance Convergence`_
|
+ `Program Instance Convergence`_
|
||||||
+ `Data Races`_
|
+ `Data Races`_
|
||||||
+ `Uniform Variables and Varying Control Flow`_
|
+ `Uniform Variables and Varying Control Flow`_
|
||||||
+ `Task Parallelism in ISPC`_
|
+ `Task Parallelism: Language Syntax`_
|
||||||
|
+ `Task Parallelism: Runtime Requirements`_
|
||||||
|
|
||||||
* `The ISPC Standard Library`_
|
* `The ISPC Standard Library`_
|
||||||
|
|
||||||
@@ -80,6 +92,7 @@ Contents:
|
|||||||
+ `Conversions To and From Half-Precision Floats`_
|
+ `Conversions To and From Half-Precision Floats`_
|
||||||
+ `Atomic Operations and Memory Fences`_
|
+ `Atomic Operations and Memory Fences`_
|
||||||
+ `Prefetches`_
|
+ `Prefetches`_
|
||||||
|
+ `System Information`_
|
||||||
+ `Low-Level Bits`_
|
+ `Low-Level Bits`_
|
||||||
|
|
||||||
* `Interoperability with the Application`_
|
* `Interoperability with the Application`_
|
||||||
@@ -102,6 +115,8 @@ Contents:
|
|||||||
+ `Small Performance Tricks`_
|
+ `Small Performance Tricks`_
|
||||||
+ `Instrumenting Your ISPC Programs`_
|
+ `Instrumenting Your ISPC Programs`_
|
||||||
+ `Using Scan Operations For Variable Output`_
|
+ `Using Scan Operations For Variable Output`_
|
||||||
|
+ `Application-Supplied Execution Masks`_
|
||||||
|
+ `Explicit Vector Programming With Uniform Short Vector Types`_
|
||||||
|
|
||||||
* `Disclaimer and Legal Information`_
|
* `Disclaimer and Legal Information`_
|
||||||
|
|
||||||
@@ -824,8 +839,8 @@ by default. If a function is declared with a ``static`` qualifier, then it
|
|||||||
is only visible in the file in which it was declared.
|
is only visible in the file in which it was declared.
|
||||||
|
|
||||||
Any function that can be launched with the ``launch`` construct in ``ispc``
|
Any function that can be launched with the ``launch`` construct in ``ispc``
|
||||||
must have a ``task`` qualifier; see `Task Parallelism in ISPC`_ for more
|
must have a ``task`` qualifier; see `Task Parallelism: Language Syntax`_
|
||||||
discussion of launching tasks in ``ispc``.
|
for more discussion of launching tasks in ``ispc``.
|
||||||
|
|
||||||
Functions that are intended to be called from C/C++ application code must
|
Functions that are intended to be called from C/C++ application code must
|
||||||
have the ``export`` qualifier. This causes them to have regular C linkage
|
have the ``export`` qualifier. This causes them to have regular C linkage
|
||||||
@@ -926,8 +941,9 @@ execution model is critical for writing efficient and correct programs in
|
|||||||
|
|
||||||
``ispc`` supports both task parallelism to parallelize across multiple
|
``ispc`` supports both task parallelism to parallelize across multiple
|
||||||
cores and SPMD parallelism to parallelize across the SIMD vector lanes on a
|
cores and SPMD parallelism to parallelize across the SIMD vector lanes on a
|
||||||
single core. This section focuses on SPMD parallelism. See the section
|
single core. This section focuses on SPMD parallelism. See the sections
|
||||||
`Task Parallelism in ISPC`_ for discussion of task parallelism in ``ispc``.
|
`Task Parallelism: Language Syntax`_ and `Task Parallelism: Runtime
|
||||||
|
Requirements`_ for discussion of task parallelism in ``ispc``.
|
||||||
|
|
||||||
The SPMD-on-SIMD Execution Model
|
The SPMD-on-SIMD Execution Model
|
||||||
--------------------------------
|
--------------------------------
|
||||||
@@ -1174,7 +1190,7 @@ This code implicitly assumes that ``programCount`` evenly divides
|
|||||||
::
|
::
|
||||||
|
|
||||||
for (uniform int i = 0; i < count; i += programCount) {
|
for (uniform int i = 0; i < count; i += programCount) {
|
||||||
if (i + programIndex < programCount) {
|
if (i + programIndex < count) {
|
||||||
float d = data[i + programIndex];
|
float d = data[i + programIndex];
|
||||||
...
|
...
|
||||||
|
|
||||||
@@ -1370,112 +1386,190 @@ be modified in the above code even if *none* of the program instances
|
|||||||
evaluated a true value for the test, given the ``ispc`` execution model.
|
evaluated a true value for the test, given the ``ispc`` execution model.
|
||||||
|
|
||||||
|
|
||||||
Task Parallelism in ISPC
|
Task Parallelism: Language Syntax
|
||||||
------------------------
|
---------------------------------
|
||||||
|
|
||||||
One option for combining task-parallelism with ``ispc`` is to just use
|
One option for combining task-parallelism with ``ispc`` is to just use
|
||||||
regular task parallelism in the C/C++ application code (be it through
|
regular task parallelism in the C/C++ application code (be it through
|
||||||
Intel® Cilk(tm), Intel® Thread Building Blocks or another task system,
|
Intel® Cilk(tm), Intel® Thread Building Blocks or another task system), and
|
||||||
etc.), and for tasks to use ``ispc`` for SPMD parallelism across the vector
|
for tasks to use ``ispc`` for SPMD parallelism across the vector lanes as
|
||||||
lanes as appropriate. Alternatively, ``ispc`` also has some support for
|
appropriate. Alternatively, ``ispc`` also has support for launching tasks
|
||||||
launching tasks from ``ispc`` code. The approach is similar to Intel®
|
from ``ispc`` code. The approach is similar to Intel® Cilk's task launch
|
||||||
Cilk's task launch feature. (See the ``examples/mandelbrot_tasks`` example
|
feature. (See the ``examples/mandelbrot_tasks`` example to see it used in
|
||||||
to see it used in a non-trivial example.)
|
a small example.)
|
||||||
|
|
||||||
Any function that is launched as a task must be declared with the ``task``
|
First, any function that is launched as a task must be declared with the
|
||||||
qualifier:
|
``task`` qualifier:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
task void func(uniform float a[], uniform int start) {
|
task void func(uniform float a[], uniform int index) {
|
||||||
....
|
...
|
||||||
|
a[index] = ....
|
||||||
}
|
}
|
||||||
|
|
||||||
Tasks must return ``void``; a compile time error is issued if a
|
Tasks must return ``void``; a compile time error is issued if a
|
||||||
non-``void`` task is defined.
|
non-``void`` task is defined.
|
||||||
|
|
||||||
Given a task, one can then write code that launches tasks as follows:
|
Given a task definition, there are two ways to write code that launches
|
||||||
|
tasks, using the ``launch`` construct. First, one task can be launched at
|
||||||
|
a time, with parameters passed to the task to help it determine what part
|
||||||
|
of the overall computation it's responsible for:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
for (uniform int i = 0; i < 100; ++i)
|
for (uniform int i = 0; i < 100; ++i)
|
||||||
launch < func(a, i); >
|
launch < func(a, i) >;
|
||||||
|
|
||||||
Note the ``launch`` keyword and the brackets around the function call.
|
Note the ``launch`` keyword and the brackets around the function call.
|
||||||
This code launches 100 tasks, each of which presumably does some
|
This code launches 100 tasks, each of which presumably does some
|
||||||
computation keyed off of given the value ``i``. In general, one should
|
computation that is keyed off of the given value ``i``. In general, one
|
||||||
launch many more tasks than there are processors in the system to
|
should launch many more tasks than there are processors in the system to
|
||||||
ensure good load-balancing, but not so many that the overhead of scheduling
|
ensure good load-balancing, but not so many that the overhead of scheduling
|
||||||
and running tasks dominates the computation.
|
and running tasks dominates the computation.
|
||||||
|
|
||||||
Program execution continues asynchronously after task launch; thus, the
|
Alternatively, a number of tasks may be launched from a single ``launch``
|
||||||
function shouldn't access values being generated by the tasks without
|
statement. We might instead write the above example with a single
|
||||||
synchronization. A function uses a ``sync`` statement to wait for all
|
``launch`` like this:
|
||||||
launched tasks to finish:
|
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
for (uniform int i = 0; i < 100; ++i)
|
launch[100] < func2(a) >;
|
||||||
launch < func(a, i); >
|
|
||||||
|
Where an integer value (not necessarily a compile-time constant) is
|
||||||
|
provided to the ``launch`` keyword in square brackets; this number of tasks
|
||||||
|
will be enqueued to be run asynchronously. Within each of the tasks, two
|
||||||
|
special built-in variables are available--``taskIndex``, and ``taskCount``.
|
||||||
|
The first, ``taskIndex``, ranges from zero to one less than the number of tasks
|
||||||
|
provided to ``launch``, and ``taskCount`` equals the number of launched
|
||||||
|
tasks. Thus, we might use ``taskIndex`` in the implementation of ``func2``
|
||||||
|
to determine which array element to process.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
task void func2(uniform float a[]) {
|
||||||
|
...
|
||||||
|
a[taskIndex] = ...
|
||||||
|
}
|
||||||
|
|
||||||
|
Program execution continues asynchronously after a ``launch`` statement;
|
||||||
|
thus, a function shouldn't access values being generated by the tasks it
|
||||||
|
has launched within the function without synchronization. If results are
|
||||||
|
needed before function return, a function can use a ``sync`` statement to
|
||||||
|
wait for all launched tasks to finish:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
launch[100] < func2(a) >;
|
||||||
sync;
|
sync;
|
||||||
// now safe to use computed values in a[]...
|
// now safe to use computed values in a[]...
|
||||||
|
|
||||||
Alternatively, any function that launches tasks has an implicit ``sync``
|
Alternatively, any function that launches tasks has an automatically-added
|
||||||
before it returns, so that functions that call a function that launches
|
``sync`` statement before it returns, so that functions that call a
|
||||||
tasks don't have to worry about outstanding asynchronous computation.
|
function that launches tasks don't have to worry about outstanding
|
||||||
|
asynchronous computation from that function.
|
||||||
|
|
||||||
Inside functions with the ``task`` qualifier, two additional built-in
|
Inside functions with the ``task`` qualifier, two additional built-in
|
||||||
variables are provided: ``threadIndex`` and ``threadCount``.
|
variables are provided in addition to ``taskIndex`` and ``taskCount``:
|
||||||
``threadCount`` gives the total number of hardware threads that have been
|
``threadIndex`` and ``threadCount``. ``threadCount`` gives the total
|
||||||
launched by the task system. ``threadIndex`` provides an index between
|
number of hardware threads that have been launched by the task system.
|
||||||
zero and ``threadCount-1`` that gives a unique index that corresponds to
|
``threadIndex`` provides an index between zero and ``threadCount-1`` that
|
||||||
the hardware thread that is executing the current task. The
|
gives a unique index that corresponds to the hardware thread that is
|
||||||
``threadIndex`` can be used for accessing data that is private to the
|
executing the current task. The ``threadIndex`` can be used for accessing
|
||||||
current thread and thus doesn't require synchronization to access under
|
data that is private to the current thread and thus doesn't require
|
||||||
parallel execution.
|
synchronization to access under parallel execution.
|
||||||
|
|
||||||
|
Task Parallelism: Runtime Requirements
|
||||||
|
--------------------------------------
|
||||||
|
|
||||||
If you use the task launch feature in ``ispc``, you must provide C/C++
|
If you use the task launch feature in ``ispc``, you must provide C/C++
|
||||||
implementations of two functions and link them into your final executable
|
implementations of three specific functions that manage launching and
|
||||||
file. Although these functions may be implemented in either language, they
|
synchronizing parallel tasks; these functions must be linked into your
|
||||||
must have "C" linkage (i.e. their prototypes must be declared inside an
|
executable. Although these functions may be implemented in any
|
||||||
``extern "C"`` block if they are defined in C++.)
|
language, they must have "C" linkage (i.e. their prototypes must be
|
||||||
|
declared inside an ``extern "C"`` block if they are defined in C++.)
|
||||||
|
|
||||||
|
By using user-supplied versions of these functions, ``ispc`` programs can
|
||||||
|
easily interoperate with software systems that have existing task systems
|
||||||
|
for managing parallelism. If you're using ``ispc`` with a system that
|
||||||
|
isn't otherwise multi-threaded and don't want to write custom
|
||||||
|
implementations of them, you can use the implementations of these functions
|
||||||
|
provided in the ``examples/tasksys.cpp`` file in the ``ispc``
|
||||||
|
distributions.
|
||||||
|
|
||||||
|
If you are implementing your own task system, the remainder of this section
|
||||||
|
discusses the requirements for these calls. You will also likely want to
|
||||||
|
review the example task systems in ``examples/tasksys.cpp`` for reference.
|
||||||
|
If you are not implementing your own task system, you can skip reading the
|
||||||
|
remainder of this section.
|
||||||
|
|
||||||
|
Here are the declarations of the three functions that must be provided to
|
||||||
|
manage tasks in ``ispc``:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
void ISPCLaunch(void *funcptr, void *data);
|
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
|
||||||
void ISPCSync();
|
void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
|
||||||
|
void ISPCSync(void *handle);
|
||||||
|
|
||||||
On Windows, two additional functions must be provided to dynamically
|
All three of these functions take an opaque handle (or a pointer to an
|
||||||
allocate and free memory to store the arguments passed to tasks. (On OSX
|
opaque handle) as their first parameter. This handle allows the task
|
||||||
and Linux, the stack provides memory for task arguments; on Windows, the
|
system runtime to distinguish between calls to these functions from
|
||||||
stack is generally not large enough to do this for large numbers of tasks.)
|
different functions in ``ispc`` code. In this way, the task system
|
||||||
|
implementation can efficiently wait for completion on just the tasks
|
||||||
|
launched from a single function.
|
||||||
|
|
||||||
|
The first time one of ``ISPCLaunch()`` or ``ISPCAlloc()`` is called in an
|
||||||
|
``ispc`` function, the ``void *`` pointed to by the ``handlePtr`` parameter
|
||||||
|
will be ``NULL``. The implementations of these functions should then
|
||||||
|
initialize ``*handlePtr`` to a unique handle value of some sort. (For
|
||||||
|
example, it might allocate a small structure to record which tasks were
|
||||||
|
launched by the current function.) In subsequent calls to these functions
|
||||||
|
in the emitted ``ispc`` code, the same value for ``handlePtr`` will be
|
||||||
|
passed in, such that loading from ``*handlePtr`` will retrieve the value
|
||||||
|
stored in the first call.
|
||||||
|
|
||||||
|
At function exit (or at an explicit ``sync`` statement), a call to
|
||||||
|
``ISPCSync()`` will be generated if ``*handlePtr`` is non-``NULL``.
|
||||||
|
Therefore, the handle value is passed directly to ``ISPCSync()``, rather
|
||||||
|
than a pointer to it, as in the other functions.
|
||||||
|
|
||||||
|
The ``ISPCAlloc()`` function is used to allocate small blocks of memory to
|
||||||
|
store parameters passed to tasks. It should return a pointer to memory
|
||||||
|
with the given size and alignment. Note that there is no explicit
|
||||||
|
``ISPCFree()`` call; instead, all memory allocated within an ``ispc``
|
||||||
|
function should be freed when ``ISPCSync()`` is called.
|
||||||
|
|
||||||
|
``ISPCLaunch()`` is called to launch one or more asynchronous
|
||||||
|
tasks. Each ``launch`` statement in ``ispc`` code causes a call to
|
||||||
|
``ISPCLaunch()`` to be emitted in the generated code. The three parameters
|
||||||
|
after the handle pointer to this function are relatively straightforward;
|
||||||
|
the ``void *f`` parameter holds a pointer to a function to call to run the
|
||||||
|
work for this task, ``data`` holds a pointer to data to pass to this
|
||||||
|
function, and ``count`` is the number of instances of this function to
|
||||||
|
enqueue for asynchronous execution. (In other words, ``count`` corresponds
|
||||||
|
to the value ``n`` in a multiple-task launch statement like ``launch[n]``.)
|
||||||
|
|
||||||
|
The signature of the provided function pointer ``f`` is
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
void (*TaskFuncPtr)(void *data, int threadIndex, int threadCount,
|
||||||
void ISPCFree(void *ptr);
|
int taskIndex, int taskCount)
|
||||||
|
|
||||||
These are called by the task launch code generated by the ``ispc``
|
When this function pointer is called by one of the hardware threads managed
|
||||||
compiler; the first is called to launch to launch a task and the second is
|
by the task system, the ``data`` pointer passed to ``ISPCLaunch()`` should
|
||||||
called to wait for, respectively. (Factoring them out in this way
|
be passed to it for its first parameter; ``threadCount`` gives the total
|
||||||
allows ``ispc`` to inter-operate with the application's task system, if
|
number of hardware threads that have been spawned to run tasks and
|
||||||
any, rather than having a separate one of its own.) To run a particular
|
``threadIndex`` should be an integer index between zero and ``threadCount-1``
|
||||||
task, the task system should cast the function pointer to a ``void (*)(void
|
uniquely identifying the hardware thread that is running the task. (These
|
||||||
*, int, int)`` function pointer and then call it with the provided ``void
|
values can be used to index into thread-local storage.)
|
||||||
*`` data and then an index for the current hardware thread and the total
|
|
||||||
number of hardware threads the task system has launched--in other words:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
typedef void (*TaskFuncType)(void *, int, int);
|
|
||||||
TaskFuncType tft = (TaskFuncType)(funcptr);
|
|
||||||
tft(data, threadIndex, threadCount);
|
|
||||||
|
|
||||||
A number of sample task system implementations are provided with ``ispc``;
|
|
||||||
see the files ``tasks_concrt.cpp``, ``tasks_gcd.cpp`` and
|
|
||||||
``tasks_pthreads.cpp`` in the ``examples/mandelbrot_tasks`` directory of
|
|
||||||
the ``ispc`` distribution.
|
|
||||||
|
|
||||||
|
The value of ``taskCount`` should be the number of tasks launched in the
|
||||||
|
``launch`` statement that caused the call to ``ISPCLaunch()`` and each of
|
||||||
|
the calls to this function should be given a unique value of ``taskIndex``
|
||||||
|
between zero and ``taskCount-1``, to distinguish which of the instances
|
||||||
|
of the set of launched tasks is running.
|
||||||
|
|
||||||
The ISPC Standard Library
|
The ISPC Standard Library
|
||||||
=========================
|
=========================
|
||||||
@@ -2020,12 +2114,12 @@ end.)
|
|||||||
|
|
||||||
One thing to note is that that the value being added to here is a
|
One thing to note is that that the value being added to here is a
|
||||||
``uniform`` integer, while the increment amount and the return value are
|
``uniform`` integer, while the increment amount and the return value are
|
||||||
``varying``. In other words, the semantics are that each running program
|
``varying``. In other words, the semantics of this call are that each
|
||||||
instance individually issues the atomic operation with its own ``delta``
|
running program instance individually issues the atomic operation with its
|
||||||
value and gets the previous value of ``val`` back in return. The atomics
|
own ``delta`` value and gets the previous value of ``val`` back in return.
|
||||||
for the running program instances may be issued in arbitrary order; it's
|
The atomics for the running program instances may be issued in arbitrary
|
||||||
not guaranteed that they will be issued in ``programIndex`` order, for
|
order; it's not guaranteed that they will be issued in ``programIndex``
|
||||||
example.
|
order, for example.
|
||||||
|
|
||||||
Here are the declarations of the ``int32`` variants of these functions.
|
Here are the declarations of the ``int32`` variants of these functions.
|
||||||
There are also ``int64`` equivalents as well as variants that take
|
There are also ``int64`` equivalents as well as variants that take
|
||||||
@@ -2043,17 +2137,44 @@ function can be used with ``float`` and ``double`` types as well.)
|
|||||||
int32 atomic_xor_global(reference uniform int32 val, int32 value)
|
int32 atomic_xor_global(reference uniform int32 val, int32 value)
|
||||||
int32 atomic_swap_global(reference uniform int32 val, int32 newval)
|
int32 atomic_swap_global(reference uniform int32 val, int32 newval)
|
||||||
|
|
||||||
There is also an atomic "compare and exchange" function; it atomically
|
There are also variants of these functions that take ``uniform`` values for
|
||||||
compares the value in "val" to "compare"--if they match, it assigns
|
the operand and return a ``uniform`` result:
|
||||||
"newval" to "val". In either case, the old value of "val" is returned.
|
|
||||||
(As with the other atomic operations, there are also ``unsigned`` and
|
|
||||||
64-bit variants of this function. Furthermore, there are ``float`` and
|
|
||||||
``double`` variants as well.)
|
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
|
uniform int32 atomic_add_global(reference uniform int32 val,
|
||||||
|
uniform int32 value)
|
||||||
|
uniform int32 atomic_subtract_global(reference uniform int32 val,
|
||||||
|
uniform int32 value)
|
||||||
|
uniform int32 atomic_min_global(reference uniform int32 val,
|
||||||
|
uniform int32 value)
|
||||||
|
uniform int32 atomic_max_global(reference uniform int32 val,
|
||||||
|
uniform int32 value)
|
||||||
|
uniform int32 atomic_and_global(reference uniform int32 val,
|
||||||
|
uniform int32 value)
|
||||||
|
uniform int32 atomic_or_global(reference uniform int32 val,
|
||||||
|
uniform int32 value)
|
||||||
|
uniform int32 atomic_xor_global(reference uniform int32 val,
|
||||||
|
uniform int32 value)
|
||||||
|
uniform int32 atomic_swap_global(reference uniform int32 val,
|
||||||
|
uniform int32 newval)
|
||||||
|
|
||||||
|
There are also atomic swap and "compare and exchange" functions.
|
||||||
|
Compare and exchange atomically compares the value in "val" to
|
||||||
|
"compare"--if they match, it assigns "newval" to "val". In either case,
|
||||||
|
the old value of "val" is returned. (As with the other atomic operations,
|
||||||
|
there are also ``unsigned`` and 64-bit variants of this function.
|
||||||
|
Furthermore, there are ``float`` and ``double`` variants as well.)
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
int32 atomic_swap_global(reference uniform int32 val, int32 new)
|
||||||
|
uniform int32 atomic_swap_global(reference uniform int32 val,
|
||||||
|
uniform int32 new)
|
||||||
int32 atomic_compare_exchange_global(reference uniform int32 val,
|
int32 atomic_compare_exchange_global(reference uniform int32 val,
|
||||||
int32 compare, int32 newval)
|
int32 compare, int32 newval)
|
||||||
|
uniform int32 atomic_compare_exchange_global(reference uniform int32 val,
|
||||||
|
uniform int32 compare, uniform int32 newval)
|
||||||
|
|
||||||
``ispc`` also has a standard library routine that inserts a memory barrier
|
``ispc`` also has a standard library routine that inserts a memory barrier
|
||||||
into the code; it ensures that all memory reads and writes prior to be
|
into the code; it ensures that all memory reads and writes prior to be
|
||||||
@@ -2102,6 +2223,20 @@ These functions are available for all of the basic types in the
|
|||||||
language--``int8``, ``int16``, ``int32``, ``float``, and so forth.
|
language--``int8``, ``int16``, ``int32``, ``float``, and so forth.
|
||||||
|
|
||||||
|
|
||||||
|
System Information
|
||||||
|
------------------
|
||||||
|
|
||||||
|
A routine is available to find the number of CPU cores available in the
|
||||||
|
system:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
int num_cores()
|
||||||
|
|
||||||
|
This value can be useful for adapting the granularity of parallel task
|
||||||
|
decomposition depending on the number of processors in the system.
|
||||||
|
|
||||||
|
|
||||||
Low-Level Bits
|
Low-Level Bits
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
@@ -2209,14 +2344,14 @@ Both the ``foo`` and ``bar`` global variables can be accessed on each
|
|||||||
side.
|
side.
|
||||||
|
|
||||||
``ispc`` code can also call back to C/C++. On the ``ispc`` side, any
|
``ispc`` code can also call back to C/C++. On the ``ispc`` side, any
|
||||||
application functions to be called must be declared with the ``export "C"``
|
application functions to be called must be declared with the ``extern "C"``
|
||||||
qualifier.
|
qualifier.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
extern "C" void foo(uniform float f, uniform float g);
|
extern "C" void foo(uniform float f, uniform float g);
|
||||||
|
|
||||||
Unlike in C++, ``export "C"`` doesn't take braces to delineate
|
Unlike in C++, ``extern "C"`` doesn't take braces to delineate
|
||||||
multiple functions to be declared; thus, multiple C functions to be called
|
multiple functions to be declared; thus, multiple C functions to be called
|
||||||
from ``ispc`` must be declared as follows:
|
from ``ispc`` must be declared as follows:
|
||||||
|
|
||||||
@@ -2843,6 +2978,91 @@ values to ``outArray[1]`` and ``outArray[2]``, and so forth. The
|
|||||||
``reduce_add`` call at the end returns the total number of values that the
|
``reduce_add`` call at the end returns the total number of values that the
|
||||||
program instances have written to the array.
|
program instances have written to the array.
|
||||||
|
|
||||||
|
Application-Supplied Execution Masks
|
||||||
|
------------------------------------
|
||||||
|
|
||||||
|
Recall that when execution transitions from the application code to an
|
||||||
|
``ispc`` function, all of the program instances are initially executing.
|
||||||
|
In some cases, it may be desired that only some of them are running, based on
|
||||||
|
a data-dependent condition computed in the application program. This
|
||||||
|
situation can easily be handled via an additional parameter from the
|
||||||
|
application.
|
||||||
|
|
||||||
|
As a simple example, consider a case where the application code has an
|
||||||
|
array of ``float`` values and we'd like the ``ispc`` code to update
|
||||||
|
just specific values in that array, where the choice of which values are to be
|
||||||
|
updated has been determined by the application. In C++ code, we might
|
||||||
|
have:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
int count = ...;
|
||||||
|
float *array = new float[count];
|
||||||
|
bool *shouldUpdate = new bool[count];
|
||||||
|
// initialize array and shouldUpdate
|
||||||
|
ispc_func(array, shouldUpdate, count);
|
||||||
|
|
||||||
|
Then, the ``ispc`` code could process this update as:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
export void ispc_func(uniform float array[], uniform bool update[],
|
||||||
|
uniform int count) {
|
||||||
|
for (uniform int i = 0; i < count; i += programCount) {
|
||||||
|
cif (update[i+programIndex] == true)
|
||||||
|
// update array[i+programIndex]...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
(In this case a "coherent" if statement is likely to be worthwhile if the
|
||||||
|
``update`` array will tend to have sections that are either all-true or
|
||||||
|
all-false.)
|
||||||
|
|
||||||
|
Explicit Vector Programming With Uniform Short Vector Types
|
||||||
|
-----------------------------------------------------------
|
||||||
|
|
||||||
|
The typical model for programming in ``ispc`` is an *implicit* parallel
|
||||||
|
model, where one writes a program that is apparently doing scalar
|
||||||
|
computation on values and the program is then vectorized to run in parallel
|
||||||
|
across the SIMD lanes of a processor. However, ``ispc`` also has some
|
||||||
|
support for explicit vector unit programming, where the vectorization is
|
||||||
|
explicit. Some computations may be more effectively described in the
|
||||||
|
explicit model rather than the implicit model.
|
||||||
|
|
||||||
|
This support is provided via ``uniform`` instances of short vectors
|
||||||
|
(as were introduced in the `Short Vector Types`_ section). Specifically,
|
||||||
|
if this short program
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
export uniform float<8> madd(uniform float<8> a,
|
||||||
|
uniform float<8> b, uniform float<8> c) {
|
||||||
|
return a + b * c;
|
||||||
|
}
|
||||||
|
|
||||||
|
is compiled with the AVX target, ``ispc`` generates the following assembly:
|
||||||
|
|
||||||
|
::
|
||||||
|
_madd:
|
||||||
|
vmulps %ymm2, %ymm1, %ymm1
|
||||||
|
vaddps %ymm0, %ymm1, %ymm0
|
||||||
|
ret
|
||||||
|
|
||||||
|
(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
|
||||||
|
``addps`` instructions are generated, and so forth.)
|
||||||
|
|
||||||
|
Note that ``ispc`` doesn't currently support control-flow based on
|
||||||
|
``uniform`` short vector types; it is thus not possible to write code like:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
|
||||||
|
uniform int<8> sum = 0;
|
||||||
|
while (a++ < b)
|
||||||
|
++sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
Disclaimer and Legal Information
|
Disclaimer and Legal Information
|
||||||
================================
|
================================
|
||||||
|
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
|
|||||||
# This could be handy for archiving the generated documentation or
|
# This could be handy for archiving the generated documentation or
|
||||||
# if some version control system is used.
|
# if some version control system is used.
|
||||||
|
|
||||||
PROJECT_NUMBER = 1.0.6
|
PROJECT_NUMBER = 1.0.10
|
||||||
|
|
||||||
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
|
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
|
||||||
# base path where the generated documentation will be put.
|
# base path where the generated documentation will be put.
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ the runtimes and the speedup delivered by ispc. It may be instructive to
|
|||||||
do a side-by-side diff of the C++ and ispc implementations of these
|
do a side-by-side diff of the C++ and ispc implementations of these
|
||||||
algorithms to learn more about writing ispc code.
|
algorithms to learn more about writing ispc code.
|
||||||
|
|
||||||
|
|
||||||
AOBench
|
AOBench
|
||||||
=======
|
=======
|
||||||
|
|
||||||
@@ -27,6 +28,7 @@ It executes the program for the given number of iterations, rendering an
|
|||||||
(xres x yres) image each time and measuring the computation time with both
|
(xres x yres) image each time and measuring the computation time with both
|
||||||
serial and ispc implementations.
|
serial and ispc implementations.
|
||||||
|
|
||||||
|
|
||||||
AOBench_Instrumented
|
AOBench_Instrumented
|
||||||
====================
|
====================
|
||||||
|
|
||||||
@@ -40,12 +42,47 @@ is provided in the instrument.cpp file.
|
|||||||
*** Note: on Linux, this example currently hits an assertion in LLVM during
|
*** Note: on Linux, this example currently hits an assertion in LLVM during
|
||||||
*** compilation
|
*** compilation
|
||||||
|
|
||||||
|
|
||||||
|
Deferred
|
||||||
|
========
|
||||||
|
|
||||||
|
This is an extensive example of using ispc for efficient
|
||||||
|
deferred shading of scenes with thousands of lights; it's an implementation
|
||||||
|
of the algorithm that Johan Andersson described at SIGGRAPH 2009,
|
||||||
|
implemented by Andrew Lauritzen and Jefferson Montgomery. The basic idea
|
||||||
|
is that a pre-rendered G-buffer is partitioned into tiles, and in each
|
||||||
|
tile, the set of lights that contribute to the tile is first computed.
|
||||||
|
Then, the pixels in the tile are shaded using just those light
|
||||||
|
sources. (See slides 19-29 of
|
||||||
|
http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
|
||||||
|
for more details on the algorithm.)
|
||||||
|
|
||||||
|
This directory includes three implementations of the algorithm:
|
||||||
|
|
||||||
|
- An ispc implementation that first does a static partitioning of the
|
||||||
|
screen into tiles to parallelize across the CPU cores. Within each tile
|
||||||
|
ispc kernels provide highly efficient implementations of the light
|
||||||
|
culling and shading calculations.
|
||||||
|
- A "best practices" serial C++ implementation. This implementation does a
|
||||||
|
dynamic partitioning of the screen, refining tiles with significant Z
|
||||||
|
depth complexity (these tiles often have a large number of lights that
|
||||||
|
affect them). Within each final tile, the pixels are shaded using
|
||||||
|
regular C++ code.
|
||||||
|
- If the Cilk extensions are available in your compiler, an ispc
|
||||||
|
implementation that uses Cilk will also be built.
|
||||||
|
(See http://software.intel.com/en-us/articles/intel-cilk-plus/). Like
|
||||||
|
the "best practices" serial implementation, this version does dynamic
|
||||||
|
tile partitioning for better load balancing and then uses ispc for the
|
||||||
|
light culling and shading.
|
||||||
|
|
||||||
|
|
||||||
Mandelbrot
|
Mandelbrot
|
||||||
==========
|
==========
|
||||||
|
|
||||||
Mandelbrot set generation. This example is extensively documented at the
|
Mandelbrot set generation. This example is extensively documented at the
|
||||||
http://ispc.github.com/example.html page.
|
http://ispc.github.com/example.html page.
|
||||||
|
|
||||||
|
|
||||||
Mandelbrot_tasks
|
Mandelbrot_tasks
|
||||||
================
|
================
|
||||||
|
|
||||||
@@ -58,6 +95,7 @@ using tasks with ispc, no task system is mandated; the user is free to plug
|
|||||||
in any task system they want, for ease of interoperating with existing task
|
in any task system they want, for ease of interoperating with existing task
|
||||||
systems.
|
systems.
|
||||||
|
|
||||||
|
|
||||||
Noise
|
Noise
|
||||||
=====
|
=====
|
||||||
|
|
||||||
@@ -71,6 +109,7 @@ Options
|
|||||||
This program implements both the Black-Scholes and Binomial options pricing
|
This program implements both the Black-Scholes and Binomial options pricing
|
||||||
models in both ispc and regular serial C++ code.
|
models in both ispc and regular serial C++ code.
|
||||||
|
|
||||||
|
|
||||||
RT
|
RT
|
||||||
==
|
==
|
||||||
|
|
||||||
@@ -87,6 +126,7 @@ and triangle intersection code from pbrt; see the pbrt source code and/or
|
|||||||
"Physically Based Rendering" book for more about the basic algorithmic
|
"Physically Based Rendering" book for more about the basic algorithmic
|
||||||
details.
|
details.
|
||||||
|
|
||||||
|
|
||||||
Simple
|
Simple
|
||||||
======
|
======
|
||||||
|
|
||||||
@@ -94,6 +134,7 @@ This is a simple "hello world" type program that shows a ~10 line
|
|||||||
application program calling out to a ~5 line ispc program to do a simple
|
application program calling out to a ~5 line ispc program to do a simple
|
||||||
computation.
|
computation.
|
||||||
|
|
||||||
|
|
||||||
Volume
|
Volume
|
||||||
======
|
======
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,14 @@
|
|||||||
|
|
||||||
CXX=g++ -m64
|
ARCH = $(shell uname)
|
||||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
|
||||||
|
TASK_CXX=../tasksys.cpp
|
||||||
|
TASK_LIB=-lpthread
|
||||||
|
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||||
|
|
||||||
|
CXX=g++
|
||||||
|
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O2 --fast-math --arch=x86-64
|
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
|
||||||
|
|
||||||
default: ao
|
default: ao
|
||||||
|
|
||||||
@@ -14,12 +20,15 @@ dirs:
|
|||||||
clean:
|
clean:
|
||||||
/bin/rm -rf objs *~ ao
|
/bin/rm -rf objs *~ ao
|
||||||
|
|
||||||
ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o
|
ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o $(TASK_OBJ)
|
||||||
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o -lm -lpthread
|
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||||
|
|
||||||
objs/%.o: %.cpp
|
objs/%.o: %.cpp
|
||||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||||
|
|
||||||
|
objs/%.o: ../%.cpp
|
||||||
|
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||||
|
|
||||||
objs/ao.o: objs/ao_ispc.h
|
objs/ao.o: objs/ao_ispc.h
|
||||||
|
|
||||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||||
|
|||||||
@@ -173,10 +173,30 @@ int main(int argc, char **argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Report results and save image
|
// Report results and save image
|
||||||
printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", minTimeISPC,
|
printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n",
|
||||||
width, height);
|
minTimeISPC, width, height);
|
||||||
savePPM("ao-ispc.ppm", width, height);
|
savePPM("ao-ispc.ppm", width, height);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Run the ispc + tasks path, test_iterations times, and report the
|
||||||
|
// minimum time for any of them.
|
||||||
|
//
|
||||||
|
double minTimeISPCTasks = 1e30;
|
||||||
|
for (unsigned int i = 0; i < test_iterations; i++) {
|
||||||
|
memset((void *)fimg, 0, sizeof(float) * width * height * 3);
|
||||||
|
assert(NSUBSAMPLES == 2);
|
||||||
|
|
||||||
|
reset_and_start_timer();
|
||||||
|
ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
|
||||||
|
double t = get_elapsed_mcycles();
|
||||||
|
minTimeISPCTasks = std::min(minTimeISPCTasks, t);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Report results and save image
|
||||||
|
printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n",
|
||||||
|
minTimeISPCTasks, width, height);
|
||||||
|
savePPM("ao-ispc-tasks.ppm", width, height);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Run the serial path, again test_iteration times, and report the
|
// Run the serial path, again test_iteration times, and report the
|
||||||
// minimum time.
|
// minimum time.
|
||||||
@@ -193,7 +213,8 @@ int main(int argc, char **argv)
|
|||||||
// Report more results, save another image...
|
// Report more results, save another image...
|
||||||
printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial,
|
printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial,
|
||||||
width, height);
|
width, height);
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
|
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||||
|
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||||
savePPM("ao-serial.ppm", width, height);
|
savePPM("ao-serial.ppm", width, height);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|||||||
@@ -203,8 +203,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
|
|||||||
/* Compute the image for the scanlines from [y0,y1), for an overall image
|
/* Compute the image for the scanlines from [y0,y1), for an overall image
|
||||||
of width w and height h.
|
of width w and height h.
|
||||||
*/
|
*/
|
||||||
void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||||
uniform int nsubsamples, reference uniform float image[]) {
|
uniform int h, uniform int nsubsamples,
|
||||||
|
reference uniform float image[]) {
|
||||||
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||||
static Sphere spheres[3] = {
|
static Sphere spheres[3] = {
|
||||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||||
@@ -231,6 +232,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
|||||||
// direction we do per iteration and ny the number in y.
|
// direction we do per iteration and ny the number in y.
|
||||||
uniform int nx = 1, ny = 1;
|
uniform int nx = 1, ny = 1;
|
||||||
|
|
||||||
|
// FIXME: We actually need ny to be 1 regardless of the decomposition,
|
||||||
|
// since the task decomposition is one scanline high.
|
||||||
|
|
||||||
if (programCount == 8) {
|
if (programCount == 8) {
|
||||||
// Do two pixels at once in the x direction
|
// Do two pixels at once in the x direction
|
||||||
nx = 2;
|
nx = 2;
|
||||||
@@ -239,19 +243,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
|||||||
++du;
|
++du;
|
||||||
}
|
}
|
||||||
else if (programCount == 16) {
|
else if (programCount == 16) {
|
||||||
// Two at once in both x and y
|
nx = 4;
|
||||||
nx = ny = 2;
|
ny = 1;
|
||||||
if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
|
if (programIndex >= 4 && programIndex < 8)
|
||||||
++du;
|
++du;
|
||||||
if (programIndex >= 8)
|
if (programIndex >= 8 && programIndex < 12)
|
||||||
++dv;
|
du += 2;
|
||||||
|
if (programIndex >= 12)
|
||||||
|
du += 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Now loop over all of the pixels, stepping in x and y as calculated
|
// Now loop over all of the pixels, stepping in x and y as calculated
|
||||||
// above. (Assumes that ny divides y and nx divides x...)
|
// above. (Assumes that ny divides y and nx divides x...)
|
||||||
for (uniform int y = y0; y < y1; y += ny) {
|
for (uniform int y = y0; y < y1; y += ny) {
|
||||||
for (uniform int x = 0; x < w; x += nx) {
|
for (uniform int x = 0; x < w; x += nx) {
|
||||||
// Figur out x,y pixel in NDC
|
// Figure out x,y pixel in NDC
|
||||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||||
float ret = 0.f;
|
float ret = 0.f;
|
||||||
@@ -293,7 +299,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
|||||||
|
|
||||||
// offset to the first pixel in the image
|
// offset to the first pixel in the image
|
||||||
uniform int offset = 3 * (y * w + x);
|
uniform int offset = 3 * (y * w + x);
|
||||||
for (uniform int p = 0; p < programCount; p += 4, ++offset) {
|
for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
|
||||||
// Get the four sample values for this pixel
|
// Get the four sample values for this pixel
|
||||||
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
|
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
|
||||||
retArray[p+3];
|
retArray[p+3];
|
||||||
@@ -315,3 +321,15 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
|
|||||||
uniform float image[]) {
|
uniform float image[]) {
|
||||||
ao_scanlines(0, h, w, h, nsubsamples, image);
|
ao_scanlines(0, h, w, h, nsubsamples, image);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void task ao_task(uniform int width, uniform int height,
|
||||||
|
uniform int nsubsamples, uniform float image[]) {
|
||||||
|
ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||||
|
uniform float image[]) {
|
||||||
|
launch[h] < ao_task(w, h, nsubsamples, image) >;
|
||||||
|
}
|
||||||
|
|||||||
@@ -140,7 +140,7 @@ ray_plane_intersect(Isect &isect, Ray &ray,
|
|||||||
float d = -dot(plane.p, plane.n);
|
float d = -dot(plane.p, plane.n);
|
||||||
float v = dot(ray.dir, plane.n);
|
float v = dot(ray.dir, plane.n);
|
||||||
|
|
||||||
if (fabsf(v) < 1.0e-17)
|
if (fabsf(v) < 1.0e-17f)
|
||||||
return;
|
return;
|
||||||
else {
|
else {
|
||||||
float t = -(dot(ray.org, plane.n) + d) / v;
|
float t = -(dot(ray.org, plane.n) + d) / v;
|
||||||
@@ -183,11 +183,11 @@ orthoBasis(vec basis[3], const vec &n) {
|
|||||||
basis[2] = n;
|
basis[2] = n;
|
||||||
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
||||||
|
|
||||||
if ((n.x < 0.6) && (n.x > -0.6)) {
|
if ((n.x < 0.6f) && (n.x > -0.6f)) {
|
||||||
basis[1].x = 1.0;
|
basis[1].x = 1.0;
|
||||||
} else if ((n.y < 0.6) && (n.y > -0.6)) {
|
} else if ((n.y < 0.6f) && (n.y > -0.6f)) {
|
||||||
basis[1].y = 1.0;
|
basis[1].y = 1.0;
|
||||||
} else if ((n.z < 0.6) && (n.z > -0.6)) {
|
} else if ((n.z < 0.6f) && (n.z > -0.6f)) {
|
||||||
basis[1].z = 1.0;
|
basis[1].z = 1.0;
|
||||||
} else {
|
} else {
|
||||||
basis[1].x = 1.0;
|
basis[1].x = 1.0;
|
||||||
@@ -224,7 +224,7 @@ ambient_occlusion(Isect &isect, Plane &plane,
|
|||||||
float phi = 2.0f * M_PI * drand48();
|
float phi = 2.0f * M_PI * drand48();
|
||||||
float x = cosf(phi) * theta;
|
float x = cosf(phi) * theta;
|
||||||
float y = sinf(phi) * theta;
|
float y = sinf(phi) * theta;
|
||||||
float z = sqrtf(1.0 - theta * theta);
|
float z = sqrtf(1.0f - theta * theta);
|
||||||
|
|
||||||
// local . global
|
// local . global
|
||||||
float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
|
float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
|
||||||
@@ -236,14 +236,14 @@ ambient_occlusion(Isect &isect, Plane &plane,
|
|||||||
ray.dir.y = ry;
|
ray.dir.y = ry;
|
||||||
ray.dir.z = rz;
|
ray.dir.z = rz;
|
||||||
|
|
||||||
occIsect.t = 1.0e+17;
|
occIsect.t = 1.0e+17f;
|
||||||
occIsect.hit = 0;
|
occIsect.hit = 0;
|
||||||
|
|
||||||
for (int snum = 0; snum < 3; ++snum)
|
for (int snum = 0; snum < 3; ++snum)
|
||||||
ray_sphere_intersect(occIsect, ray, spheres[snum]);
|
ray_sphere_intersect(occIsect, ray, spheres[snum]);
|
||||||
ray_plane_intersect (occIsect, ray, plane);
|
ray_plane_intersect (occIsect, ray, plane);
|
||||||
|
|
||||||
if (occIsect.hit) occlusion += 1.0;
|
if (occIsect.hit) occlusion += 1.f;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -280,10 +280,10 @@ static void ao_scanlines(int y0, int y1, int w, int h, int nsubsamples,
|
|||||||
|
|
||||||
ray.dir.x = px;
|
ray.dir.x = px;
|
||||||
ray.dir.y = py;
|
ray.dir.y = py;
|
||||||
ray.dir.z = -1.0;
|
ray.dir.z = -1.0f;
|
||||||
vnormalize(ray.dir);
|
vnormalize(ray.dir);
|
||||||
|
|
||||||
isect.t = 1.0e+17;
|
isect.t = 1.0e+17f;
|
||||||
isect.hit = 0;
|
isect.hit = 0;
|
||||||
|
|
||||||
for (int snum = 0; snum < 3; ++snum)
|
for (int snum = 0; snum < 3; ++snum)
|
||||||
|
|||||||
3
examples/aobench/aobench.vcxproj
Executable file → Normal file
3
examples/aobench/aobench.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
|||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
<ItemGroup Label="ProjectConfigurations">
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
<ProjectConfiguration Include="Debug|Win32">
|
<ProjectConfiguration Include="Debug|Win32">
|
||||||
@@ -21,6 +21,7 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClCompile Include="ao.cpp" />
|
<ClCompile Include="ao.cpp" />
|
||||||
<ClCompile Include="ao_serial.cpp" />
|
<ClCompile Include="ao_serial.cpp" />
|
||||||
|
<ClCompile Include="../tasksys.cpp" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="ao.ispc">
|
<CustomBuild Include="ao.ispc">
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
CXX=g++ -m64
|
CXX=g++ -m64
|
||||||
CXXFLAGS=-Iobjs/ -g3 -Wall
|
CXXFLAGS=-Iobjs/ -g3 -Wall
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O2 --fast-math --instrument --arch=x86-64
|
ISPCFLAGS=-O2 --instrument --arch=x86-64
|
||||||
|
|
||||||
default: ao
|
default: ao
|
||||||
|
|
||||||
|
|||||||
0
examples/aobench_instrumented/aobench_instrumented.vcxproj
Executable file → Normal file
0
examples/aobench_instrumented/aobench_instrumented.vcxproj
Executable file → Normal file
42
examples/deferred/Makefile
Normal file
42
examples/deferred/Makefile
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
|
||||||
|
ARCH = $(shell uname)
|
||||||
|
|
||||||
|
TASK_CXX=../tasks_pthreads.cpp
|
||||||
|
TASK_LIB=-lpthread
|
||||||
|
|
||||||
|
ifeq ($(ARCH), Darwin)
|
||||||
|
TASK_CXX=../tasks_gcd.cpp
|
||||||
|
TASK_LIB=
|
||||||
|
endif
|
||||||
|
|
||||||
|
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||||
|
|
||||||
|
CXX=g++
|
||||||
|
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||||
|
ISPC=ispc
|
||||||
|
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64 --math-lib=fast
|
||||||
|
|
||||||
|
OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/dynamic_c.o objs/dynamic_cilk.o
|
||||||
|
|
||||||
|
default: deferred_shading
|
||||||
|
|
||||||
|
.PHONY: dirs clean
|
||||||
|
.PRECIOUS: objs/kernels_ispc.h
|
||||||
|
|
||||||
|
dirs:
|
||||||
|
/bin/mkdir -p objs/
|
||||||
|
|
||||||
|
clean:
|
||||||
|
/bin/rm -rf objs *~ deferred_shading
|
||||||
|
|
||||||
|
deferred_shading: dirs $(OBJS) $(TASK_OBJ)
|
||||||
|
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(TASK_OBJ) -lm $(TASK_LIB)
|
||||||
|
|
||||||
|
objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
|
||||||
|
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||||
|
|
||||||
|
objs/%.o: ../%.cpp
|
||||||
|
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||||
|
|
||||||
|
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||||
|
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||||
209
examples/deferred/common.cpp
Normal file
209
examples/deferred/common.cpp
Normal file
@@ -0,0 +1,209 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#define _CRT_SECURE_NO_WARNINGS
|
||||||
|
#define ISPC_IS_WINDOWS
|
||||||
|
#elif defined(__linux__)
|
||||||
|
#define ISPC_IS_LINUX
|
||||||
|
#elif defined(__APPLE__)
|
||||||
|
#define ISPC_IS_APPLE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <float.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <vector>
|
||||||
|
#ifdef ISPC_IS_WINDOWS
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#include <windows.h>
|
||||||
|
#endif
|
||||||
|
#ifdef ISPC_IS_LINUX
|
||||||
|
#include <malloc.h>
|
||||||
|
#endif
|
||||||
|
#include "deferred.h"
|
||||||
|
#include "../timing.h"
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
static void *
|
||||||
|
lAlignedMalloc(int64_t size, int32_t alignment) {
|
||||||
|
#ifdef ISPC_IS_WINDOWS
|
||||||
|
return _aligned_malloc(size, alignment);
|
||||||
|
#endif
|
||||||
|
#ifdef ISPC_IS_LINUX
|
||||||
|
return memalign(alignment, size);
|
||||||
|
#endif
|
||||||
|
#ifdef ISPC_IS_APPLE
|
||||||
|
void *mem = malloc(size + (alignment-1) + sizeof(void*));
|
||||||
|
char *amem = ((char*)mem) + sizeof(void*);
|
||||||
|
amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
|
||||||
|
(alignment - 1)));
|
||||||
|
((void**)amem)[-1] = mem;
|
||||||
|
return amem;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
lAlignedFree(void *ptr) {
|
||||||
|
#ifdef ISPC_IS_WINDOWS
|
||||||
|
_aligned_free(ptr);
|
||||||
|
#endif
|
||||||
|
#ifdef ISPC_IS_LINUX
|
||||||
|
free(ptr);
|
||||||
|
#endif
|
||||||
|
#ifdef ISPC_IS_APPLE
|
||||||
|
free(((void**)ptr)[-1]);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Framebuffer::Framebuffer(int width, int height) {
|
||||||
|
nPixels = width*height;
|
||||||
|
r = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
|
||||||
|
g = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
|
||||||
|
b = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Framebuffer::~Framebuffer() {
|
||||||
|
lAlignedFree(r);
|
||||||
|
lAlignedFree(g);
|
||||||
|
lAlignedFree(b);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
Framebuffer::clear() {
|
||||||
|
memset(r, 0, nPixels);
|
||||||
|
memset(g, 0, nPixels);
|
||||||
|
memset(b, 0, nPixels);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Loads an input data set from the binary file at `path`.
//
// The file is read as an ispc::InputHeader followed by one data chunk of
// header.inputDataChunkSize bytes.  The header's inputDataArrayOffsets table
// (indexed by InputDataArraysEnum) gives the byte offset of each array
// inside the chunk; the pointers in input->arrays are set to point there.
//
// Returns a newly allocated InputData on success.  Returns NULL if the file
// cannot be opened, is shorter than expected, or the chunk cannot be
// allocated; in those cases the file is closed and everything allocated so
// far is released.
InputData *
CreateInputDataFromFile(const char *path) {
    FILE *in = fopen(path, "rb");
    if (!in) return 0;

    InputData *input = new InputData;
    input->chunk = 0;

    // Load header
    if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) {
        fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
        fclose(in);
        delete input;
        return NULL;
    }

    // Load data chunk and update pointers
    input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
                                             ALIGNMENT_BYTES);
    if (!input->chunk) {
        fprintf(stderr, "Out of memory reading file \"%s\"\n", path);
        fclose(in);
        delete input;
        return NULL;
    }
    if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
        fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
        fclose(in);
        lAlignedFree(input->chunk);
        delete input;
        return NULL;
    }

    // The whole file has been consumed; the handle is no longer needed.
    fclose(in);

    // Resolve each array pointer from its byte offset within the chunk.
    input->arrays.zBuffer =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
    input->arrays.normalEncoded_x =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]];
    input->arrays.normalEncoded_y =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]];
    input->arrays.specularAmount =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]];
    input->arrays.specularPower =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]];
    input->arrays.albedo_x =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]];
    input->arrays.albedo_y =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]];
    input->arrays.albedo_z =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]];
    input->arrays.lightPositionView_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]];
    input->arrays.lightPositionView_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]];
    input->arrays.lightPositionView_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]];
    input->arrays.lightAttenuationBegin =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]];
    input->arrays.lightColor_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]];
    input->arrays.lightColor_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]];
    input->arrays.lightColor_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]];
    input->arrays.lightAttenuationEnd =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]];

    return input;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Releases the data chunk owned by `input`.
//
// Only the chunk is freed; the InputData object itself is left to the
// caller.  After the call the pointers in input->arrays no longer refer to
// valid memory.  A null `input`, or one whose chunk has already been
// released, is ignored, so calling this twice is harmless.
void DeleteInputData(InputData *input)
{
    if (!input || !input->chunk)
        return;
    lAlignedFree(input->chunk);
    // Clear the pointer so a repeated call cannot free the chunk again.
    input->chunk = 0;
}
|
||||||
|
|
||||||
|
|
||||||
|
void WriteFrame(const char *filename, const InputData *input,
|
||||||
|
const Framebuffer &framebuffer) {
|
||||||
|
// Deswizzle and copy to RGBA output
|
||||||
|
// Doesn't need to be fast... only happens once
|
||||||
|
size_t imageBytes = 3 * input->header.framebufferWidth *
|
||||||
|
input->header.framebufferHeight;
|
||||||
|
uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
|
||||||
|
memset(framebufferAOS, 0, imageBytes);
|
||||||
|
|
||||||
|
for (int i = 0; i < input->header.framebufferWidth *
|
||||||
|
input->header.framebufferHeight; ++i) {
|
||||||
|
framebufferAOS[3 * i + 0] = framebuffer.r[i];
|
||||||
|
framebufferAOS[3 * i + 1] = framebuffer.g[i];
|
||||||
|
framebufferAOS[3 * i + 2] = framebuffer.b[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write out simple PPM file
|
||||||
|
FILE *out = fopen(filename, "wb");
|
||||||
|
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
|
||||||
|
input->header.framebufferHeight);
|
||||||
|
fwrite(framebufferAOS, imageBytes, 1, out);
|
||||||
|
|
||||||
|
lAlignedFree(framebufferAOS);
|
||||||
|
}
|
||||||
BIN
examples/deferred/data/pp1280x720.bin
Normal file
BIN
examples/deferred/data/pp1280x720.bin
Normal file
Binary file not shown.
BIN
examples/deferred/data/pp1920x1200.bin
Normal file
BIN
examples/deferred/data/pp1920x1200.bin
Normal file
Binary file not shown.
108
examples/deferred/deferred.h
Normal file
108
examples/deferred/deferred.h
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef DEFERRED_H
|
||||||
|
#define DEFERRED_H
|
||||||
|
|
||||||
|
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||||
|
#define MIN_TILE_WIDTH 16
|
||||||
|
#define MIN_TILE_HEIGHT 16
|
||||||
|
#define MAX_LIGHTS 1024
|
||||||
|
|
||||||
|
// Slot numbers for the per-array byte offsets stored in the input header's
// inputDataArrayOffsets table.  The values are consecutive starting at 0;
// idaNum is the number of slots.
enum InputDataArraysEnum {
    // Surface arrays
    idaZBuffer               = 0,
    idaNormalEncoded_x       = 1,
    idaNormalEncoded_y       = 2,
    idaSpecularAmount        = 3,
    idaSpecularPower         = 4,
    idaAlbedo_x              = 5,
    idaAlbedo_y              = 6,
    idaAlbedo_z              = 7,

    // Light arrays
    idaLightPositionView_x   = 8,
    idaLightPositionView_y   = 9,
    idaLightPositionView_z   = 10,
    idaLightAttenuationBegin = 11,
    idaLightColor_x          = 12,
    idaLightColor_y          = 13,
    idaLightColor_z          = 14,
    idaLightAttenuationEnd   = 15,

    // Count of the entries above
    idaNum                   = 16
};
|
||||||
|
|
||||||
|
#ifndef ISPC
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include "kernels_ispc.h"
|
||||||
|
|
||||||
|
#define ALIGNMENT_BYTES 64
|
||||||
|
|
||||||
|
#define MAX_LIGHTS 1024
|
||||||
|
|
||||||
|
#define VISUALIZE_LIGHT_COUNT 0
|
||||||
|
|
||||||
|
struct InputData
|
||||||
|
{
|
||||||
|
ispc::InputHeader header;
|
||||||
|
ispc::InputDataArrays arrays;
|
||||||
|
uint8_t *chunk;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
// Planar 8-bit RGB render target.  The red, green and blue channels live in
// three separate buffers that the object allocates in its constructor and
// frees in its destructor.
struct Framebuffer {
    // Allocates the three planes for a width x height image.
    Framebuffer(int width, int height);
    ~Framebuffer();

    // Zeroes all three planes.
    void clear();

    // One byte per pixel in each plane.
    uint8_t *r, *g, *b;

private:
    // Number of pixels (and therefore bytes) in each plane.
    int nPixels;

    // Copying is forbidden: the object owns its planes, so a member-wise
    // copy would free them twice.  Both operations are declared private.
    // The assignment operator must take a reference; with a pointer
    // parameter it is not a copy-assignment operator and the compiler would
    // still generate a public one.
    Framebuffer(const Framebuffer &);
    Framebuffer &operator=(const Framebuffer &);
};
|
||||||
|
|
||||||
|
|
||||||
|
InputData *CreateInputDataFromFile(const char *path);
|
||||||
|
void DeleteInputData(InputData *input);
|
||||||
|
void WriteFrame(const char *filename, const InputData *input,
|
||||||
|
const Framebuffer &framebuffer);
|
||||||
|
void InitDynamicC(InputData *input);
|
||||||
|
void InitDynamicCilk(InputData *input);
|
||||||
|
void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
|
||||||
|
void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
|
||||||
|
|
||||||
|
#endif // !ISPC
|
||||||
|
|
||||||
|
#endif // DEFERRED_H
|
||||||
170
examples/deferred/deferred_shading.vcxproj
Executable file
170
examples/deferred/deferred_shading.vcxproj
Executable file
@@ -0,0 +1,170 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|Win32">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>Win32</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|Win32">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>Win32</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
|
||||||
|
<Keyword>Win32Proj</Keyword>
|
||||||
|
<RootNamespace>mandelbrot</RootNamespace>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
<CharacterSet>Unicode</CharacterSet>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
<CharacterSet>Unicode</CharacterSet>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<UseDebugLibraries>false</UseDebugLibraries>
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
<CharacterSet>Unicode</CharacterSet>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<UseDebugLibraries>false</UseDebugLibraries>
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
<CharacterSet>Unicode</CharacterSet>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
|
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||||
|
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
|
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||||
|
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
|
<LinkIncremental>true</LinkIncremental>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
|
<LinkIncremental>true</LinkIncremental>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
|
<LinkIncremental>false</LinkIncremental>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
|
<LinkIncremental>false</LinkIncremental>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
|
<ClCompile>
|
||||||
|
<PrecompiledHeader>
|
||||||
|
</PrecompiledHeader>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
</Link>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
|
<ClCompile>
|
||||||
|
<PrecompiledHeader>
|
||||||
|
</PrecompiledHeader>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
</Link>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PrecompiledHeader>
|
||||||
|
</PrecompiledHeader>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||||
|
<OptimizeReferences>true</OptimizeReferences>
|
||||||
|
</Link>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PrecompiledHeader>
|
||||||
|
</PrecompiledHeader>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||||
|
<OptimizeReferences>true</OptimizeReferences>
|
||||||
|
</Link>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<ClCompile Include="common.cpp" />
|
||||||
|
<ClCompile Include="dynamic_c.cpp" />
|
||||||
|
<ClCompile Include="dynamic_cilk.cpp" />
|
||||||
|
<ClCompile Include="main.cpp" />
|
||||||
|
<ClCompile Include="../tasks_concrt.cpp" />
|
||||||
|
</ItemGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CustomBuild Include="kernels.ispc">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||||
|
</Command>
|
||||||
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||||
|
</Command>
|
||||||
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||||
|
</Command>
|
||||||
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||||
|
</Command>
|
||||||
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
|
</CustomBuild>
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
||||||
871
examples/deferred/dynamic_c.cpp
Normal file
871
examples/deferred/dynamic_c.cpp
Normal file
@@ -0,0 +1,871 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "deferred.h"
|
||||||
|
#include "kernels_ispc.h"
|
||||||
|
#include <algorithm>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#define ISPC_IS_WINDOWS
|
||||||
|
#elif defined(__linux__)
|
||||||
|
#define ISPC_IS_LINUX
|
||||||
|
#elif defined(__APPLE__)
|
||||||
|
#define ISPC_IS_APPLE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ISPC_IS_LINUX
|
||||||
|
#include <malloc.h>
|
||||||
|
#endif // ISPC_IS_LINUX
|
||||||
|
|
||||||
|
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||||
|
#define MIN_TILE_WIDTH 16
|
||||||
|
#define MIN_TILE_HEIGHT 16
|
||||||
|
|
||||||
|
|
||||||
|
#define DYNAMIC_TREE_LEVELS 5
|
||||||
|
// If this is set to 1 then the result will be identical to the static version
|
||||||
|
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||||
|
|
||||||
|
// Allocates `size` bytes at an address that is a multiple of `alignment`.
//
// `alignment` is assumed to be a power of two (the Apple branch relies on
// masking with alignment-1).  The result must be released with
// lAlignedFree().  The implementation is chosen by the ISPC_IS_* platform
// macro that is defined for this build.
static void *
lAlignedMalloc(int64_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
#endif
#ifdef ISPC_IS_LINUX
    return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
    // Over-allocate: payload, worst-case padding of alignment-1 bytes, and
    // one pointer-sized slot that remembers the raw malloc() result.
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    if (!mem)
        return 0;

    // Round the first usable address (just past the pointer slot) UP to the
    // next multiple of `alignment`.  An address that is already aligned
    // stays where it is, so the padding never exceeds alignment-1 bytes and
    // the payload always fits inside the allocation.
    uintptr_t firstUsable = reinterpret_cast<uintptr_t>(mem) + sizeof(void*);
    uintptr_t mask = (uintptr_t)alignment - 1;
    char *amem = reinterpret_cast<char *>((firstUsable + mask) & ~mask);

    // Stash the raw pointer right in front of the aligned block so that
    // lAlignedFree() can recover it.
    ((void**)amem)[-1] = mem;
    return amem;
#endif
}
|
||||||
|
|
||||||
|
|
||||||
|
// Releases a buffer that was obtained from lAlignedMalloc().
//
// Each branch below is selected by one of the ISPC_IS_* platform macros and
// undoes the matching allocation strategy.  A null pointer is ignored on
// every platform.
static void
lAlignedFree(void *ptr) {
    // The Apple branch reads the pointer slot stored just in front of the
    // buffer, so a null pointer must be rejected before getting there.
    if (!ptr)
        return;
#ifdef ISPC_IS_WINDOWS
    _aligned_free(ptr);
#endif
#ifdef ISPC_IS_LINUX
    free(ptr);
#endif
#ifdef ISPC_IS_APPLE
    // lAlignedMalloc() saved the raw malloc() result immediately before the
    // aligned address it handed out; that raw pointer is what gets freed.
    free(((void**)ptr)[-1]);
#endif
}
|
||||||
|
|
||||||
|
|
||||||
|
// Scans the rectangle [tileStartX, tileEndX) x [tileStartY, tileEndY) of the
// depth buffer and reports the smallest and largest view-space Z found.
//
// Each stored depth z is converted with cameraProj_43 / (z - cameraProj_33).
// Only results in [cameraNear, cameraFar) take part; everything else
// (skybox/background or otherwise invalid pixels) is skipped.  If no pixel
// qualifies, *minZ is cameraFar and *maxZ is cameraNear.
//
// zBuffer is addressed as zBuffer[y * gBufferWidth + x].
static void
ComputeZBounds(int tileStartX, int tileEndX,
               int tileStartY, int tileEndY,
               // G-buffer data
               float zBuffer[],
               int gBufferWidth,
               // Camera data
               float cameraProj_33, float cameraProj_43,
               float cameraNear, float cameraFar,
               // Output
               float *minZ, float *maxZ)
{
    // Start from an inverted range so the first accepted sample sets both
    // bounds.
    float nearest = cameraFar;
    float farthest = cameraNear;

    for (int row = tileStartY; row < tileEndY; ++row) {
        const int rowBase = row * gBufferWidth;
        for (int col = tileStartX; col < tileEndX; ++col) {
            // Unproject the depth value into view space.
            const float depth = zBuffer[rowBase + col];
            const float viewSpaceZ = cameraProj_43 / (depth - cameraProj_33);

            // Reject samples outside [cameraNear, cameraFar).
            if (!(viewSpaceZ < cameraFar) || !(viewSpaceZ >= cameraNear))
                continue;

            if (viewSpaceZ < nearest)
                nearest = viewSpaceZ;
            if (farthest < viewSpaceZ)
                farthest = viewSpaceZ;
        }
    }

    *minZ = nearest;
    *maxZ = farthest;
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
|
||||||
|
int numTilesX, int numTilesY,
|
||||||
|
// G-buffer data
|
||||||
|
float zBuffer[],
|
||||||
|
int gBufferWidth,
|
||||||
|
// Camera data
|
||||||
|
float cameraProj_33, float cameraProj_43,
|
||||||
|
float cameraNear, float cameraFar,
|
||||||
|
// Output
|
||||||
|
float minZArray[],
|
||||||
|
float maxZArray[])
|
||||||
|
{
|
||||||
|
for (int tileX = 0; tileX < numTilesX; ++tileX) {
|
||||||
|
float minZ, maxZ;
|
||||||
|
ComputeZBounds(
|
||||||
|
tileX * tileWidth, tileX * tileWidth + tileWidth,
|
||||||
|
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
||||||
|
zBuffer, gBufferWidth,
|
||||||
|
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||||
|
&minZ, &maxZ);
|
||||||
|
minZArray[tileX] = minZ;
|
||||||
|
maxZArray[tileX] = maxZ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class MinMaxZTree
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
// Currently (min) tile dimensions must divide gBuffer dimensions evenly
|
||||||
|
// Levels must be small enough that neither dimension goes below one tile
|
||||||
|
MinMaxZTree(
|
||||||
|
int tileWidth, int tileHeight, int levels,
|
||||||
|
int gBufferWidth, int gBufferHeight)
|
||||||
|
: mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
|
||||||
|
{
|
||||||
|
mNumTilesX = gBufferWidth / mTileWidth;
|
||||||
|
mNumTilesY = gBufferHeight / mTileHeight;
|
||||||
|
|
||||||
|
// Allocate arrays
|
||||||
|
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||||
|
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||||
|
for (int i = 0; i < mLevels; ++i) {
|
||||||
|
int x = NumTilesX(i);
|
||||||
|
int y = NumTilesY(i);
|
||||||
|
assert(x > 0);
|
||||||
|
assert(y > 0);
|
||||||
|
// NOTE: If the following two asserts fire it probably means that
|
||||||
|
// the base tile dimensions do not evenly divide the G-buffer dimensions
|
||||||
|
assert(x * (mTileWidth << i) >= gBufferWidth);
|
||||||
|
assert(y * (mTileHeight << i) >= gBufferHeight);
|
||||||
|
mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||||
|
mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Update(float *zBuffer, int gBufferPitchInElements,
|
||||||
|
float cameraProj_33, float cameraProj_43,
|
||||||
|
float cameraNear, float cameraFar)
|
||||||
|
{
|
||||||
|
for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
|
||||||
|
ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
|
||||||
|
zBuffer, gBufferPitchInElements,
|
||||||
|
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||||
|
mMinZArrays[0] + (tileY * mNumTilesX),
|
||||||
|
mMaxZArrays[0] + (tileY * mNumTilesX));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate other levels
|
||||||
|
for (int level = 1; level < mLevels; ++level) {
|
||||||
|
int destTilesX = NumTilesX(level);
|
||||||
|
int destTilesY = NumTilesY(level);
|
||||||
|
int srcLevel = level - 1;
|
||||||
|
int srcTilesX = NumTilesX(srcLevel);
|
||||||
|
int srcTilesY = NumTilesY(srcLevel);
|
||||||
|
for (int y = 0; y < destTilesY; ++y) {
|
||||||
|
for (int x = 0; x < destTilesX; ++x) {
|
||||||
|
int srcX = x << 1;
|
||||||
|
int srcY = y << 1;
|
||||||
|
// NOTE: Ugly branches to deal with non-multiple dimensions at some levels
|
||||||
|
// TODO: SSE branchless min/max is probably better...
|
||||||
|
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||||
|
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||||
|
if (srcX + 1 < srcTilesX) {
|
||||||
|
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||||
|
(srcX + 1)]);
|
||||||
|
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||||
|
(srcX + 1)]);
|
||||||
|
if (srcY + 1 < srcTilesY) {
|
||||||
|
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||||
|
(srcX + 1)]);
|
||||||
|
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||||
|
(srcX + 1)]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (srcY + 1 < srcTilesY) {
|
||||||
|
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||||
|
(srcX )]);
|
||||||
|
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||||
|
(srcX )]);
|
||||||
|
}
|
||||||
|
mMinZArrays[level][y * destTilesX + x] = minZ;
|
||||||
|
mMaxZArrays[level][y * destTilesX + x] = maxZ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
~MinMaxZTree() {
|
||||||
|
for (int i = 0; i < mLevels; ++i) {
|
||||||
|
lAlignedFree(mMinZArrays[i]);
|
||||||
|
lAlignedFree(mMaxZArrays[i]);
|
||||||
|
}
|
||||||
|
lAlignedFree(mMinZArrays);
|
||||||
|
lAlignedFree(mMaxZArrays);
|
||||||
|
}
|
||||||
|
|
||||||
|
int Levels() const { return mLevels; }
|
||||||
|
|
||||||
|
// These round UP, so beware that the last tile for a given level may not be completely full
|
||||||
|
// TODO: Verify this...
|
||||||
|
int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
|
||||||
|
int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
|
||||||
|
int TileWidth(int level = 0) const { return (mTileWidth << level); }
|
||||||
|
int TileHeight(int level = 0) const { return (mTileHeight << level); }
|
||||||
|
|
||||||
|
float MinZ(int level, int tileX, int tileY) const {
|
||||||
|
return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||||
|
}
|
||||||
|
float MaxZ(int level, int tileX, int tileY) const {
|
||||||
|
return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
int mTileWidth;
|
||||||
|
int mTileHeight;
|
||||||
|
int mLevels;
|
||||||
|
int mNumTilesX;
|
||||||
|
int mNumTilesY;
|
||||||
|
|
||||||
|
// One array for each "level" in the tree
|
||||||
|
float **mMinZArrays;
|
||||||
|
float **mMaxZArrays;
|
||||||
|
};
|
||||||
|
|
||||||
|
static MinMaxZTree *gMinMaxZTree = 0;
|
||||||
|
|
||||||
|
// Creates the global min/max Z tree for the framebuffer size given in
// input->header, using MIN_TILE_WIDTH x MIN_TILE_HEIGHT base tiles and
// DYNAMIC_TREE_LEVELS levels.  Calling this again replaces the existing
// tree instead of leaking it.
void InitDynamicC(InputData *input) {
    // Release a tree left over from an earlier call (deleting a null
    // pointer is a no-op), and clear the global first so it never dangles
    // if the allocation below throws.
    delete gMinMaxZTree;
    gMinMaxZTree = 0;

    gMinMaxZTree =
        new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
                        input->header.framebufferWidth,
                        input->header.framebufferHeight);
}
|
||||||
|
|
||||||
|
|
||||||
|
// numLights need not be a multiple of programCount here, but the input and output arrays
|
||||||
|
// should be able to handle programCount-sized load/stores.
|
||||||
|
// Distribute the lights of a parent tile over its four subtiles.
//
// The parent tile is split by one vertical plane through tileMidX and one
// horizontal plane through tileMidY.  For every entry of lightIndices the
// light is first tested against each subtile's [minZ, maxZ] depth range and
// then against the two splitting planes; it is appended to the index list
// of every subtile it may touch.
//
// Subtile order everywhere is 00, 10, 01, 11.  The list for subtile s
// starts at subtileIndices[s * subtileIndicesPitch]; the number of entries
// written for it is returned in subtileNumLights[s].
static void
SplitTileMinMax(
    int tileMidX, int tileMidY,
    // Subtile data (00, 10, 01, 11)
    float subtileMinZ[],
    float subtileMaxZ[],
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int lightIndices[],
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Outputs
    int subtileIndices[],
    int subtileIndicesPitch,
    int subtileNumLights[]
    )
{
    const float halfWidth = 0.5f * (float)gBufferWidth;
    const float halfHeight = 0.5f * (float)gBufferHeight;

    // Splitting planes: entry 0 is the X split, entry 1 the Y split.
    float planeXY[2] = { -(cameraProj_11 * halfWidth),
                         (cameraProj_22 * halfHeight) };
    float planeZ[2] = { tileMidX - halfWidth,
                        tileMidY - halfHeight };

    // Scale both planes to unit length so plane distances are comparable
    // with the light radius.
    for (int p = 0; p < 2; ++p) {
        float invLength = 1.f / sqrtf(planeXY[p] * planeXY[p] +
                                      planeZ[p] * planeZ[p]);
        planeXY[p] *= invLength;
        planeZ[p] *= invLength;
    }

    // Next write position inside each subtile's slice of subtileIndices
    int writePos[4];
    for (int s = 0; s < 4; ++s)
        writePos[s] = s * subtileIndicesPitch;

    for (int n = 0; n < numLights; ++n) {
        const int lightIndex = lightIndices[n];

        const float lightX = light_positionView_x_array[lightIndex];
        const float lightY = light_positionView_y_array[lightIndex];
        const float lightZ = light_positionView_z_array[lightIndex];
        const float radius = light_attenuationEnd_array[lightIndex];
        const float negRadius = -radius;

        // Test the light against each subtile's z bounds
        bool keep[4];
        for (int s = 0; s < 4; ++s) {
            keep[s] = (lightZ - subtileMinZ[s] >= negRadius) &&
                      (subtileMaxZ[s] - lightZ >= negRadius);
        }

        // Signed distances to the two splitting planes
        const float dx = lightZ * planeZ[0] + lightX * planeXY[0];
        const float dy = lightZ * planeZ[1] + lightY * planeXY[1];

        // If the light does not straddle the X split it can only touch
        // the subtiles on one side of it: 00/01 when dx > 0, else 10/11.
        if (fabsf(dx) > radius) {
            const bool positiveX = dx > 0.0f;
            for (int s = 0; s < 4; ++s) {
                const bool wantsPositive = ((s & 1) == 0);
                keep[s] = keep[s] && (wantsPositive == positiveX);
            }
        }
        // Same for the Y split: 00/10 when dy > 0, else 01/11.
        if (fabsf(dy) > radius) {
            const bool positiveY = dy > 0.0f;
            for (int s = 0; s < 4; ++s) {
                const bool wantsPositive = ((s & 2) == 0);
                keep[s] = keep[s] && (wantsPositive == positiveY);
            }
        }

        for (int s = 0; s < 4; ++s) {
            if (keep[s])
                subtileIndices[writePos[s]++] = lightIndex;
        }
    }

    // Convert the final write positions back into per-subtile counts
    for (int s = 0; s < 4; ++s)
        subtileNumLights[s] = writePos[s] - s * subtileIndicesPitch;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Dot product of the 3-vectors (x, y, z) and (a, b, c).
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    const float sum = x*a + y*b + z*c;
    return sum;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Scale the vector (x, y, z) to unit length and store it in (ox, oy, oz).
// The inputs are taken by value, so the outputs may refer to the same
// variables as the inputs.  A zero-length input is not special-cased.
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    const float lengthSquared = x*x + y*y + z*z;
    const float invLength = 1.f / sqrtf(lengthSquared);
    ox = x * invLength;
    oy = y * invLength;
    oz = z * invLength;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Map an 8-bit unsigned normalized value (0..255) to a float in [0, 1].
static inline float
Unorm8ToFloat32(uint8_t u) {
    const float scale = 1.0f / 255.0f;
    return (float)u * scale;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Map a float in [0, 1] to an 8-bit unsigned normalized value (0..255),
// truncating toward zero.
//
// Inputs outside [0, 1] are clamped and NaN maps to 0: converting a float
// whose truncated value does not fit in uint8_t is undefined behavior in
// C++, so the range is enforced before the cast.  Results for in-range
// inputs are unchanged.
static inline uint8_t
Float32ToUnorm8(float f) {
    // Written as !(f > 0) so that NaN also takes this branch.
    if (!(f > 0.0f))
        return 0;
    if (f >= 1.0f)
        return 255;
    return (uint8_t)(f * 255.0f);
}
|
||||||
|
|
||||||
|
|
||||||
|
// Expand the bit pattern of a 16-bit half-precision float to a 32-bit
// float.  "Fast" because the exponent is always re-biased as if the value
// were a normal number: an exponent field of zero (zero/denormals) or of
// all ones (inf/NaN) receives no special treatment.
static inline float half_to_float_fast(uint16_t h) {
    // Split the half into its three fields
    const uint32_t signBits = h & (int32_t)0x8000u;
    const uint32_t exponentBits = h & (int32_t)0x7C00u;
    const uint32_t mantissaBits = h & (int32_t)0x03FFu;

    // Sign moves from bit 15 to bit 31
    const uint32_t outSign = ((uint32_t) signBits) << 16;

    // Remove the half bias (15), apply the single bias (127), and move the
    // exponent into bits 23..30
    const int32_t rebiased = ((int32_t) (exponentBits >> 10)) - 15 + 127;
    const uint32_t outExponent = (uint32_t) (rebiased << 23);

    // The 10 mantissa bits become the top bits of the 23-bit mantissa
    const uint32_t outMantissa = ((uint32_t) mantissaBits) << 13;

    uint32_t bits = (outSign | outExponent | outMantissa);
    float *asFloat = reinterpret_cast<float *>(&bits);
    return *asFloat;
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
ShadeTileC(
|
||||||
|
int32_t tileStartX, int32_t tileEndX,
|
||||||
|
int32_t tileStartY, int32_t tileEndY,
|
||||||
|
int32_t gBufferWidth, int32_t gBufferHeight,
|
||||||
|
const ispc::InputDataArrays &inputData,
|
||||||
|
// Camera data
|
||||||
|
float cameraProj_11, float cameraProj_22,
|
||||||
|
float cameraProj_33, float cameraProj_43,
|
||||||
|
// Light list
|
||||||
|
int32_t tileLightIndices[],
|
||||||
|
int32_t tileNumLights,
|
||||||
|
// UI
|
||||||
|
bool visualizeLightCount,
|
||||||
|
// Output
|
||||||
|
uint8_t framebuffer_r[],
|
||||||
|
uint8_t framebuffer_g[],
|
||||||
|
uint8_t framebuffer_b[]
|
||||||
|
)
|
||||||
|
{
|
||||||
|
if (tileNumLights == 0 || visualizeLightCount) {
|
||||||
|
uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));
|
||||||
|
for (int32_t y = tileStartY; y < tileEndY; ++y) {
|
||||||
|
for (int32_t x = tileStartX; x < tileEndX; ++x) {
|
||||||
|
int32_t framebufferIndex = (y * gBufferWidth + x);
|
||||||
|
framebuffer_r[framebufferIndex] = c;
|
||||||
|
framebuffer_g[framebufferIndex] = c;
|
||||||
|
framebuffer_b[framebufferIndex] = c;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
float twoOverGBufferWidth = 2.0f / gBufferWidth;
|
||||||
|
float twoOverGBufferHeight = 2.0f / gBufferHeight;
|
||||||
|
|
||||||
|
for (int32_t y = tileStartY; y < tileEndY; ++y) {
|
||||||
|
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||||
|
|
||||||
|
for (int32_t x = tileStartX; x < tileEndX; ++x) {
|
||||||
|
int32_t gBufferOffset = y * gBufferWidth + x;
|
||||||
|
|
||||||
|
// Reconstruct position and (negative) view vector from G-buffer
|
||||||
|
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||||
|
float Vneg_x, Vneg_y, Vneg_z;
|
||||||
|
|
||||||
|
float z = inputData.zBuffer[gBufferOffset];
|
||||||
|
|
||||||
|
// Compute screen/clip-space position
|
||||||
|
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||||
|
float positionScreen_x = (0.5f + (float)(x)) *
|
||||||
|
twoOverGBufferWidth - 1.0f;
|
||||||
|
|
||||||
|
// Unproject depth buffer Z value into view space
|
||||||
|
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
|
||||||
|
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||||
|
cameraProj_11;
|
||||||
|
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||||
|
cameraProj_22;
|
||||||
|
|
||||||
|
// We actually end up with a vector pointing *at* the
|
||||||
|
// surface (i.e. the negative view vector)
|
||||||
|
normalize3(surface_positionView_x, surface_positionView_y,
|
||||||
|
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
|
||||||
|
|
||||||
|
// Reconstruct normal from G-buffer
|
||||||
|
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||||
|
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
|
||||||
|
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
|
||||||
|
|
||||||
|
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||||
|
float m = sqrtf(4.0f * f - 1.0f);
|
||||||
|
|
||||||
|
surface_normal_x = m * (4.0f * normal_x - 2.0f);
|
||||||
|
surface_normal_y = m * (4.0f * normal_y - 2.0f);
|
||||||
|
surface_normal_z = 3.0f - 8.0f * f;
|
||||||
|
|
||||||
|
// Load other G-buffer parameters
|
||||||
|
float surface_specularAmount =
|
||||||
|
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
|
||||||
|
float surface_specularPower =
|
||||||
|
half_to_float_fast(inputData.specularPower[gBufferOffset]);
|
||||||
|
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||||
|
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||||
|
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||||
|
|
||||||
|
float lit_x = 0.0f;
|
||||||
|
float lit_y = 0.0f;
|
||||||
|
float lit_z = 0.0f;
|
||||||
|
for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||||
|
++tileLightIndex) {
|
||||||
|
int32_t lightIndex = tileLightIndices[tileLightIndex];
|
||||||
|
|
||||||
|
// Gather light data relevant to initial culling
|
||||||
|
float light_positionView_x =
|
||||||
|
inputData.lightPositionView_x[lightIndex];
|
||||||
|
float light_positionView_y =
|
||||||
|
inputData.lightPositionView_y[lightIndex];
|
||||||
|
float light_positionView_z =
|
||||||
|
inputData.lightPositionView_z[lightIndex];
|
||||||
|
float light_attenuationEnd =
|
||||||
|
inputData.lightAttenuationEnd[lightIndex];
|
||||||
|
|
||||||
|
// Compute light vector
|
||||||
|
float L_x = light_positionView_x - surface_positionView_x;
|
||||||
|
float L_y = light_positionView_y - surface_positionView_y;
|
||||||
|
float L_z = light_positionView_z - surface_positionView_z;
|
||||||
|
|
||||||
|
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
|
||||||
|
|
||||||
|
// Clip at end of attenuation
|
||||||
|
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
|
||||||
|
|
||||||
|
if (distanceToLight2 < light_attenutaionEnd2) {
|
||||||
|
float distanceToLight = sqrtf(distanceToLight2);
|
||||||
|
|
||||||
|
float distanceToLightRcp = 1.f / distanceToLight;
|
||||||
|
L_x *= distanceToLightRcp;
|
||||||
|
L_y *= distanceToLightRcp;
|
||||||
|
L_z *= distanceToLightRcp;
|
||||||
|
|
||||||
|
// Start computing brdf
|
||||||
|
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||||
|
surface_normal_z, L_x, L_y, L_z);
|
||||||
|
|
||||||
|
// Clip back facing
|
||||||
|
if (NdotL > 0.0f) {
|
||||||
|
float light_attenuationBegin =
|
||||||
|
inputData.lightAttenuationBegin[lightIndex];
|
||||||
|
|
||||||
|
// Light distance attenuation (linstep)
|
||||||
|
float lightRange = (light_attenuationEnd - light_attenuationBegin);
|
||||||
|
float falloffPosition = (light_attenuationEnd - distanceToLight);
|
||||||
|
float attenuation = std::min(falloffPosition / lightRange, 1.0f);
|
||||||
|
|
||||||
|
float H_x = (L_x - Vneg_x);
|
||||||
|
float H_y = (L_y - Vneg_y);
|
||||||
|
float H_z = (L_z - Vneg_z);
|
||||||
|
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
|
||||||
|
|
||||||
|
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||||
|
surface_normal_z, H_x, H_y, H_z);
|
||||||
|
NdotH = std::max(NdotH, 0.0f);
|
||||||
|
|
||||||
|
float specular = powf(NdotH, surface_specularPower);
|
||||||
|
float specularNorm = (surface_specularPower + 2.0f) *
|
||||||
|
(1.0f / 8.0f);
|
||||||
|
float specularContrib = surface_specularAmount *
|
||||||
|
specularNorm * specular;
|
||||||
|
|
||||||
|
float k = attenuation * NdotL * (1.0f + specularContrib);
|
||||||
|
|
||||||
|
float light_color_x = inputData.lightColor_x[lightIndex];
|
||||||
|
float light_color_y = inputData.lightColor_y[lightIndex];
|
||||||
|
float light_color_z = inputData.lightColor_z[lightIndex];
|
||||||
|
|
||||||
|
float lightContrib_x = surface_albedo_x * light_color_x;
|
||||||
|
float lightContrib_y = surface_albedo_y * light_color_y;
|
||||||
|
float lightContrib_z = surface_albedo_z * light_color_z;
|
||||||
|
|
||||||
|
lit_x += lightContrib_x * k;
|
||||||
|
lit_y += lightContrib_y * k;
|
||||||
|
lit_z += lightContrib_z * k;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Gamma correct
|
||||||
|
float gamma = 1.0 / 2.2f;
|
||||||
|
lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
|
||||||
|
lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
|
||||||
|
lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
|
||||||
|
|
||||||
|
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
|
||||||
|
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
|
||||||
|
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||||
|
int *lightIndices, int numLights,
|
||||||
|
Framebuffer *framebuffer) {
|
||||||
|
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||||
|
|
||||||
|
// If we few enough lights or this is the base case (last level), shade
|
||||||
|
// this full tile directly
|
||||||
|
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||||
|
int width = minMaxZTree->TileWidth(level);
|
||||||
|
int height = minMaxZTree->TileHeight(level);
|
||||||
|
int startX = tileX * width;
|
||||||
|
int startY = tileY * height;
|
||||||
|
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||||
|
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||||
|
|
||||||
|
// Skip entirely offscreen tiles
|
||||||
|
if (endX > startX && endY > startY) {
|
||||||
|
ShadeTileC(startX, endX, startY, endY,
|
||||||
|
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||||
|
input->arrays,
|
||||||
|
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||||
|
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||||
|
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||||
|
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||||
|
// Move down a level in the tree
|
||||||
|
--level;
|
||||||
|
tileX <<= 1;
|
||||||
|
tileY <<= 1;
|
||||||
|
int width = minMaxZTree->TileWidth(level);
|
||||||
|
int height = minMaxZTree->TileHeight(level);
|
||||||
|
|
||||||
|
// Work out splitting coords
|
||||||
|
int midX = (tileX + 1) * width;
|
||||||
|
int midY = (tileY + 1) * height;
|
||||||
|
|
||||||
|
// Read subtile min/max data
|
||||||
|
// NOTE: We must be sure to handle out-of-bounds access here since
|
||||||
|
// sometimes we'll only have 1 or 2 subtiles for non-pow-2
|
||||||
|
// framebuffer sizes.
|
||||||
|
bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
|
||||||
|
bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
|
||||||
|
|
||||||
|
// NOTE: Order is 00, 10, 01, 11
|
||||||
|
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||||
|
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||||
|
input->header.cameraFar, input->header.cameraFar};
|
||||||
|
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||||
|
input->header.cameraNear, input->header.cameraNear};
|
||||||
|
|
||||||
|
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||||
|
maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||||
|
if (rightTileExists) {
|
||||||
|
minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
|
||||||
|
maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
|
||||||
|
if (bottomTileExists) {
|
||||||
|
minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
|
||||||
|
maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (bottomTileExists) {
|
||||||
|
minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
|
||||||
|
maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cull lights into subtile lists
|
||||||
|
#ifdef ISPC_IS_WINDOWS
|
||||||
|
__declspec(align(ALIGNMENT_BYTES))
|
||||||
|
#endif
|
||||||
|
int subtileLightIndices[4][MAX_LIGHTS]
|
||||||
|
#ifndef ISPC_IS_WINDOWS
|
||||||
|
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
int subtileNumLights[4];
|
||||||
|
SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||||
|
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||||
|
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||||
|
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||||
|
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||||
|
input->arrays.lightAttenuationEnd,
|
||||||
|
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||||
|
|
||||||
|
// Recurse into subtiles
|
||||||
|
ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||||
|
subtileLightIndices[0], subtileNumLights[0],
|
||||||
|
framebuffer);
|
||||||
|
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||||
|
subtileLightIndices[1], subtileNumLights[1],
|
||||||
|
framebuffer);
|
||||||
|
ShadeDynamicTileRecurse(input, level, tileX , tileY + 1,
|
||||||
|
subtileLightIndices[2], subtileNumLights[2],
|
||||||
|
framebuffer);
|
||||||
|
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
|
||||||
|
subtileLightIndices[3], subtileNumLights[3],
|
||||||
|
framebuffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Collect the lights that may affect one screen tile.
//
// Each of the first numLights lights is tested against the tile's
// [minZ, maxZ] depth range and against the four side planes of the tile's
// frustum.  A light passes a test when its signed distance to the plane is
// at least -attenuationEnd.  The indices of all passing lights are written
// to tileLightIndices in increasing order.
//
// Returns the number of indices written.
static int
IntersectLightsWithTileMinMax(
    int tileStartX, int tileEndX,
    int tileStartY, int tileEndY,
    // Tile data
    float minZ,
    float maxZ,
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    int tileLightIndices[]
    )
{
    const float halfWidth = 0.5f * (float)gBufferWidth;
    const float halfHeight = 0.5f * (float)gBufferHeight;

    // Side planes of the tile frustum.  Planes 0 and 1 bound the tile in
    // X, planes 2 and 3 bound it in Y.
    // The xy components are constant over the whole screen.
    float planeXY[4] = { -(cameraProj_11 * halfWidth),
                         (cameraProj_11 * halfWidth),
                         (cameraProj_22 * halfHeight),
                         -(cameraProj_22 * halfHeight) };

    float planeZ[4] = { tileEndX - halfWidth,
                        -tileStartX + halfWidth,
                        tileEndY - halfHeight,
                        -tileStartY + halfHeight };

    // Normalize the planes
    for (int p = 0; p < 4; ++p) {
        float invLength = 1.f / sqrtf(planeXY[p] * planeXY[p] +
                                      planeZ[p] * planeZ[p]);
        planeXY[p] *= invLength;
        planeZ[p] *= invLength;
    }

    int count = 0;

    for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) {
        const float lightZ = light_positionView_z_array[lightIndex];
        const float negRadius = -light_attenuationEnd_array[lightIndex];

        // Depth range test first; x/y are only fetched for survivors
        if (!(lightZ - minZ >= negRadius))
            continue;
        if (!(maxZ - lightZ >= negRadius))
            continue;

        const float lightX = light_positionView_x_array[lightIndex];
        const float lightY = light_positionView_y_array[lightIndex];

        // Side plane tests
        bool inside = true;
        for (int p = 0; p < 4 && inside; ++p) {
            const float lateral = (p < 2) ? lightX : lightY;
            const float dist = lightZ * planeZ[p] + lateral * planeXY[p];
            inside = (dist >= negRadius);
        }

        // Pack and store intersecting lights
        if (inside)
            tileLightIndices[count++] = lightIndex;
    }

    return count;
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||||
|
Framebuffer *framebuffer) {
|
||||||
|
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||||
|
|
||||||
|
// Get Z min/max for this tile
|
||||||
|
int width = minMaxZTree->TileWidth(level);
|
||||||
|
int height = minMaxZTree->TileHeight(level);
|
||||||
|
float minZ = minMaxZTree->MinZ(level, tileX, tileY);
|
||||||
|
float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||||
|
|
||||||
|
int startX = tileX * width;
|
||||||
|
int startY = tileY * height;
|
||||||
|
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||||
|
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||||
|
|
||||||
|
// This is a root tile, so first do a full 6-plane cull
|
||||||
|
#ifdef ISPC_IS_WINDOWS
|
||||||
|
__declspec(align(ALIGNMENT_BYTES))
|
||||||
|
#endif
|
||||||
|
int lightIndices[MAX_LIGHTS]
|
||||||
|
#ifndef ISPC_IS_WINDOWS
|
||||||
|
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
int numLights = IntersectLightsWithTileMinMax(
|
||||||
|
startX, endX, startY, endY, minZ, maxZ,
|
||||||
|
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||||
|
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||||
|
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||||
|
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||||
|
input->arrays.lightAttenuationEnd, lightIndices);
|
||||||
|
|
||||||
|
// Now kick off the recursive process for this tile
|
||||||
|
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||||
|
numLights, framebuffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
|
||||||
|
{
|
||||||
|
MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||||
|
|
||||||
|
// Update min/max Z tree
|
||||||
|
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||||
|
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||||
|
input->header.cameraNear, input->header.cameraFar);
|
||||||
|
|
||||||
|
int rootLevel = minMaxZTree->Levels() - 1;
|
||||||
|
int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
|
||||||
|
int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
|
||||||
|
int rootTiles = rootTilesX * rootTilesY;
|
||||||
|
for (int g = 0; g < rootTiles; ++g) {
|
||||||
|
uint32_t tileY = g / rootTilesX;
|
||||||
|
uint32_t tileX = g % rootTilesX;
|
||||||
|
ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
398
examples/deferred/dynamic_cilk.cpp
Normal file
398
examples/deferred/dynamic_cilk.cpp
Normal file
@@ -0,0 +1,398 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef __cilkplusplus
|
||||||
|
|
||||||
|
#include "deferred.h"
|
||||||
|
#include "kernels_ispc.h"
|
||||||
|
#include <algorithm>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#define ISPC_IS_WINDOWS
|
||||||
|
#elif defined(__linux__)
|
||||||
|
#define ISPC_IS_LINUX
|
||||||
|
#elif defined(__APPLE__)
|
||||||
|
#define ISPC_IS_APPLE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ISPC_IS_LINUX
|
||||||
|
#include <malloc.h>
|
||||||
|
#endif // ISPC_IS_LINUX
|
||||||
|
|
||||||
|
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||||
|
#define MIN_TILE_WIDTH 16
|
||||||
|
#define MIN_TILE_HEIGHT 16
|
||||||
|
|
||||||
|
|
||||||
|
#define DYNAMIC_TREE_LEVELS 5
|
||||||
|
// If this is set to 1 then the result will be identical to the static version
|
||||||
|
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||||
|
|
||||||
|
// Allocate size bytes aligned to alignment bytes.
//
// The platform's aligned allocator is used on Windows and Linux.  On Apple
// platforms the block is over-allocated with malloc() and the pointer
// originally returned by malloc() is stored in the slot just below the
// aligned address, where lAlignedFree() picks it up again.
//
// Returns 0 when the underlying allocation fails, and also when none of
// the ISPC_IS_* platform macros is defined (previously that case fell off
// the end of the function without returning a value).
// Memory obtained here must be released with lAlignedFree().
static void *
lAlignedMalloc(int64_t size, int32_t alignment) {
#if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
#elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
#elif defined(ISPC_IS_APPLE)
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    // Do not write the bookkeeping pointer through a failed allocation
    if (mem == 0)
        return 0;
    char *amem = ((char*)mem) + sizeof(void*);
    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
                                        (alignment - 1)));
    ((void**)amem)[-1] = mem;
    return amem;
#else
    (void)size;
    (void)alignment;
    return 0;
#endif
}
|
||||||
|
|
||||||
|
|
||||||
|
// Release memory obtained from lAlignedMalloc().
//
// A null pointer is accepted and ignored on every platform.  On Apple
// platforms the pointer actually handed out by malloc() is read from the
// slot just below ptr, so a null ptr must not reach that lookup.
static void
lAlignedFree(void *ptr) {
    if (ptr == 0)
        return;
#ifdef ISPC_IS_WINDOWS
    _aligned_free(ptr);
#endif
#ifdef ISPC_IS_LINUX
    free(ptr);
#endif
#ifdef ISPC_IS_APPLE
    free(((void**)ptr)[-1]);
#endif
}
|
||||||
|
|
||||||
|
|
||||||
|
class MinMaxZTreeCilk
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
// Currently (min) tile dimensions must divide gBuffer dimensions evenly
|
||||||
|
// Levels must be small enough that neither dimension goes below one tile
|
||||||
|
MinMaxZTreeCilk(
|
||||||
|
int tileWidth, int tileHeight, int levels,
|
||||||
|
int gBufferWidth, int gBufferHeight)
|
||||||
|
: mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
|
||||||
|
{
|
||||||
|
mNumTilesX = gBufferWidth / mTileWidth;
|
||||||
|
mNumTilesY = gBufferHeight / mTileHeight;
|
||||||
|
|
||||||
|
// Allocate arrays
|
||||||
|
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||||
|
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||||
|
for (int i = 0; i < mLevels; ++i) {
|
||||||
|
int x = NumTilesX(i);
|
||||||
|
int y = NumTilesY(i);
|
||||||
|
assert(x > 0);
|
||||||
|
assert(y > 0);
|
||||||
|
// NOTE: If the following two asserts fire it probably means that
|
||||||
|
// the base tile dimensions do not evenly divide the G-buffer dimensions
|
||||||
|
assert(x * (mTileWidth << i) >= gBufferWidth);
|
||||||
|
assert(y * (mTileHeight << i) >= gBufferHeight);
|
||||||
|
mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||||
|
mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Update(float *zBuffer, int gBufferPitchInElements,
|
||||||
|
float cameraProj_33, float cameraProj_43,
|
||||||
|
float cameraNear, float cameraFar)
|
||||||
|
{
|
||||||
|
// Compute level 0 in parallel. Outer loops is here since we use Cilk
|
||||||
|
_Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
|
||||||
|
ispc::ComputeZBoundsRow(tileY,
|
||||||
|
mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
|
||||||
|
zBuffer, gBufferPitchInElements,
|
||||||
|
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||||
|
mMinZArrays[0] + (tileY * mNumTilesX),
|
||||||
|
mMaxZArrays[0] + (tileY * mNumTilesX));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate other levels
|
||||||
|
// NOTE: We currently don't use ispc here since it's sort of an
|
||||||
|
// awkward gather-based reduction Using SSE odd pack/unpack
|
||||||
|
// instructions might actually work here when we need to optimize
|
||||||
|
for (int level = 1; level < mLevels; ++level) {
|
||||||
|
int destTilesX = NumTilesX(level);
|
||||||
|
int destTilesY = NumTilesY(level);
|
||||||
|
int srcLevel = level - 1;
|
||||||
|
int srcTilesX = NumTilesX(srcLevel);
|
||||||
|
int srcTilesY = NumTilesY(srcLevel);
|
||||||
|
_Cilk_for (int y = 0; y < destTilesY; ++y) {
|
||||||
|
for (int x = 0; x < destTilesX; ++x) {
|
||||||
|
int srcX = x << 1;
|
||||||
|
int srcY = y << 1;
|
||||||
|
// NOTE: Ugly branches to deal with non-multiple dimensions at some levels
|
||||||
|
// TODO: SSE branchless min/max is probably better...
|
||||||
|
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||||
|
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||||
|
if (srcX + 1 < srcTilesX) {
|
||||||
|
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||||
|
(srcX + 1)]);
|
||||||
|
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||||
|
(srcX + 1)]);
|
||||||
|
if (srcY + 1 < srcTilesY) {
|
||||||
|
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||||
|
(srcX + 1)]);
|
||||||
|
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||||
|
(srcX + 1)]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (srcY + 1 < srcTilesY) {
|
||||||
|
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||||
|
(srcX )]);
|
||||||
|
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||||
|
(srcX )]);
|
||||||
|
}
|
||||||
|
mMinZArrays[level][y * destTilesX + x] = minZ;
|
||||||
|
mMaxZArrays[level][y * destTilesX + x] = maxZ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
~MinMaxZTreeCilk() {
|
||||||
|
for (int i = 0; i < mLevels; ++i) {
|
||||||
|
lAlignedFree(mMinZArrays[i]);
|
||||||
|
lAlignedFree(mMaxZArrays[i]);
|
||||||
|
}
|
||||||
|
lAlignedFree(mMinZArrays);
|
||||||
|
lAlignedFree(mMaxZArrays);
|
||||||
|
}
|
||||||
|
|
||||||
|
int Levels() const { return mLevels; }
|
||||||
|
|
||||||
|
// These round UP, so beware that the last tile for a given level may not be completely full
|
||||||
|
// TODO: Verify this...
|
||||||
|
int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
|
||||||
|
int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
|
||||||
|
int TileWidth(int level = 0) const { return (mTileWidth << level); }
|
||||||
|
int TileHeight(int level = 0) const { return (mTileHeight << level); }
|
||||||
|
|
||||||
|
float MinZ(int level, int tileX, int tileY) const {
|
||||||
|
return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||||
|
}
|
||||||
|
float MaxZ(int level, int tileX, int tileY) const {
|
||||||
|
return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
int mTileWidth;
|
||||||
|
int mTileHeight;
|
||||||
|
int mLevels;
|
||||||
|
int mNumTilesX;
|
||||||
|
int mNumTilesY;
|
||||||
|
|
||||||
|
// One array for each "level" in the tree
|
||||||
|
float **mMinZArrays;
|
||||||
|
float **mMaxZArrays;
|
||||||
|
};
|
||||||
|
|
||||||
|
static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;
|
||||||
|
|
||||||
|
// Creates the global min/max Z tree used by the dynamic Cilk code path,
// sized for the framebuffer described by input->header.
//
// Safe to call more than once: the new tree is built first and any tree
// left over from an earlier call is then destroyed, so repeated
// initialization no longer leaks the previous tree and its level arrays.
void InitDynamicCilk(InputData *input) {
    MinMaxZTreeCilk *freshTree =
        new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
                            input->header.framebufferWidth,
                            input->header.framebufferHeight);

    // delete on a null pointer is a no-op, so the first call needs no check
    delete gMinMaxZTreeCilk;
    gMinMaxZTreeCilk = freshTree;
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||||
|
int *lightIndices, int numLights,
|
||||||
|
Framebuffer *framebuffer) {
|
||||||
|
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||||
|
|
||||||
|
// If we have few enough lights or this is the base case (last level), shade
|
||||||
|
// this full tile directly
|
||||||
|
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||||
|
int width = minMaxZTree->TileWidth(level);
|
||||||
|
int height = minMaxZTree->TileHeight(level);
|
||||||
|
int startX = tileX * width;
|
||||||
|
int startY = tileY * height;
|
||||||
|
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||||
|
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||||
|
|
||||||
|
// Skip entirely offscreen tiles
|
||||||
|
if (endX > startX && endY > startY) {
|
||||||
|
ispc::ShadeTile(
|
||||||
|
startX, endX, startY, endY,
|
||||||
|
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||||
|
&input->arrays,
|
||||||
|
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||||
|
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||||
|
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||||
|
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||||
|
// Move down a level in the tree
|
||||||
|
--level;
|
||||||
|
tileX <<= 1;
|
||||||
|
tileY <<= 1;
|
||||||
|
int width = minMaxZTree->TileWidth(level);
|
||||||
|
int height = minMaxZTree->TileHeight(level);
|
||||||
|
|
||||||
|
// Work out splitting coords
|
||||||
|
int midX = (tileX + 1) * width;
|
||||||
|
int midY = (tileY + 1) * height;
|
||||||
|
|
||||||
|
// Read subtile min/max data
|
||||||
|
// NOTE: We must be sure to handle out-of-bounds access here since
|
||||||
|
// sometimes we'll only have 1 or 2 subtiles for non-pow-2
|
||||||
|
// framebuffer sizes.
|
||||||
|
bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
|
||||||
|
bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
|
||||||
|
|
||||||
|
// NOTE: Order is 00, 10, 01, 11
|
||||||
|
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||||
|
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||||
|
input->header.cameraFar, input->header.cameraFar};
|
||||||
|
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||||
|
input->header.cameraNear, input->header.cameraNear};
|
||||||
|
|
||||||
|
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||||
|
maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||||
|
if (rightTileExists) {
|
||||||
|
minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
|
||||||
|
maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
|
||||||
|
if (bottomTileExists) {
|
||||||
|
minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
|
||||||
|
maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (bottomTileExists) {
|
||||||
|
minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
|
||||||
|
maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cull lights into subtile lists
|
||||||
|
#ifdef ISPC_IS_WINDOWS
|
||||||
|
__declspec(align(ALIGNMENT_BYTES))
|
||||||
|
#endif
|
||||||
|
int subtileLightIndices[4][MAX_LIGHTS]
|
||||||
|
#ifndef ISPC_IS_WINDOWS
|
||||||
|
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
int subtileNumLights[4];
|
||||||
|
ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||||
|
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||||
|
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||||
|
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||||
|
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||||
|
input->arrays.lightAttenuationEnd,
|
||||||
|
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||||
|
|
||||||
|
// Recurse into subtiles
|
||||||
|
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||||
|
subtileLightIndices[0], subtileNumLights[0],
|
||||||
|
framebuffer);
|
||||||
|
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||||
|
subtileLightIndices[1], subtileNumLights[1],
|
||||||
|
framebuffer);
|
||||||
|
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY + 1,
|
||||||
|
subtileLightIndices[2], subtileNumLights[2],
|
||||||
|
framebuffer);
|
||||||
|
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
|
||||||
|
subtileLightIndices[3], subtileNumLights[3],
|
||||||
|
framebuffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||||
|
Framebuffer *framebuffer) {
|
||||||
|
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||||
|
|
||||||
|
// Get Z min/max for this tile
|
||||||
|
int width = minMaxZTree->TileWidth(level);
|
||||||
|
int height = minMaxZTree->TileHeight(level);
|
||||||
|
float minZ = minMaxZTree->MinZ(level, tileX, tileY);
|
||||||
|
float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||||
|
|
||||||
|
int startX = tileX * width;
|
||||||
|
int startY = tileY * height;
|
||||||
|
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||||
|
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||||
|
|
||||||
|
// This is a root tile, so first do a full 6-plane cull
|
||||||
|
#ifdef ISPC_IS_WINDOWS
|
||||||
|
__declspec(align(ALIGNMENT_BYTES))
|
||||||
|
#endif
|
||||||
|
int lightIndices[MAX_LIGHTS]
|
||||||
|
#ifndef ISPC_IS_WINDOWS
|
||||||
|
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
int numLights = ispc::IntersectLightsWithTileMinMax(
|
||||||
|
startX, endX, startY, endY, minZ, maxZ,
|
||||||
|
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||||
|
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||||
|
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||||
|
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||||
|
input->arrays.lightAttenuationEnd, lightIndices);
|
||||||
|
|
||||||
|
// Now kick off the recursive process for this tile
|
||||||
|
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||||
|
numLights, framebuffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
|
||||||
|
{
|
||||||
|
MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||||
|
|
||||||
|
// Update min/max Z tree
|
||||||
|
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||||
|
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||||
|
input->header.cameraNear, input->header.cameraFar);
|
||||||
|
|
||||||
|
// Launch the "root" tiles. Ideally these should at least fill the
|
||||||
|
// machine... at the moment we have a static number of "levels" to the
|
||||||
|
// mip tree but it might make sense to compute it based on the width of
|
||||||
|
// the machine.
|
||||||
|
int rootLevel = minMaxZTree->Levels() - 1;
|
||||||
|
int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
|
||||||
|
int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
|
||||||
|
int rootTiles = rootTilesX * rootTilesY;
|
||||||
|
_Cilk_for (int g = 0; g < rootTiles; ++g) {
|
||||||
|
uint32_t tileY = g / rootTilesX;
|
||||||
|
uint32_t tileX = g % rootTilesX;
|
||||||
|
ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // __cilkplusplus
|
||||||
717
examples/deferred/kernels.ispc
Normal file
717
examples/deferred/kernels.ispc
Normal file
@@ -0,0 +1,717 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2010-2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "deferred.h"
|
||||||
|
|
||||||
|
struct InputDataArrays
|
||||||
|
{
|
||||||
|
uniform float zBuffer[];
|
||||||
|
uniform unsigned int16 normalEncoded_x[]; // half float
|
||||||
|
uniform unsigned int16 normalEncoded_y[]; // half float
|
||||||
|
uniform unsigned int16 specularAmount[]; // half float
|
||||||
|
uniform unsigned int16 specularPower[]; // half float
|
||||||
|
uniform unsigned int8 albedo_x[]; // unorm8
|
||||||
|
uniform unsigned int8 albedo_y[]; // unorm8
|
||||||
|
uniform unsigned int8 albedo_z[]; // unorm8
|
||||||
|
uniform float lightPositionView_x[];
|
||||||
|
uniform float lightPositionView_y[];
|
||||||
|
uniform float lightPositionView_z[];
|
||||||
|
uniform float lightAttenuationBegin[];
|
||||||
|
uniform float lightColor_x[];
|
||||||
|
uniform float lightColor_y[];
|
||||||
|
uniform float lightColor_z[];
|
||||||
|
uniform float lightAttenuationEnd[];
|
||||||
|
};
|
||||||
|
|
||||||
|
struct InputHeader
|
||||||
|
{
|
||||||
|
uniform float cameraProj[4][4];
|
||||||
|
uniform float cameraNear;
|
||||||
|
uniform float cameraFar;
|
||||||
|
|
||||||
|
uniform int32 framebufferWidth;
|
||||||
|
uniform int32 framebufferHeight;
|
||||||
|
uniform int32 numLights;
|
||||||
|
uniform int32 inputDataChunkSize;
|
||||||
|
uniform int32 inputDataArrayOffsets[idaNum];
|
||||||
|
};
|
||||||
|
|
||||||
|
// Empty exported function that takes an InputHeader by reference and does
// nothing.
// NOTE(review): presumably exists only so that InputHeader is emitted into
// the generated header for the C++ side -- confirm before removing.
export void foo(reference InputHeader h) { }
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// Common utility routines
|
||||||
|
|
||||||
|
// Dot product of the 3-vectors (x, y, z) and (a, b, c).
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    float xyPart = x * a + y * b;
    return xyPart + z * c;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Scales (x, y, z) by the reciprocal square root of its squared length and
// writes the result to (ox, oy, oz). The inputs are passed by value, so the
// outputs may refer to the same variables as the inputs.
static inline void
normalize3(float x, float y, float z, reference float ox,
           reference float oy, reference float oz) {
    float lengthSquared = x*x + y*y + z*z;
    float invLength = rsqrt(lengthSquared);

    ox = x * invLength;
    oy = y * invLength;
    oz = z * invLength;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Converts an 8-bit unsigned normalized value (0..255) to a float by
// multiplying it by 1/255.
static inline float
Unorm8ToFloat32(unsigned int8 u) {
    float asFloat = (float)u;
    return asFloat * (1.0f / 255.0f);
}
|
||||||
|
|
||||||
|
|
||||||
|
// Converts a float to an 8-bit unsigned normalized value by scaling it by
// 255 and casting. No clamping or rounding is applied here; callers are
// expected to pass a value already limited to [0, 1].
static inline unsigned int8
Float32ToUnorm8(float f) {
    float scaled = f * 255.0f;
    return (unsigned int8)scaled;
}
|
||||||
|
|
||||||
|
|
||||||
|
// tile width must be a multiple of programCount (SIMD size)
|
||||||
|
// Finds the view-space depth range of the Z-buffer samples in the pixel
// rectangle [tileStartX, tileEndX) x [tileStartY, tileEndY).
//
// Each sample is unprojected with cameraProj_43 / (z - cameraProj_33);
// samples outside [cameraNear, cameraFar) are ignored. If no sample
// qualifies, minZ is cameraFar and maxZ is cameraNear on return.
// Samples are read programCount at a time along X.
static void
ComputeZBounds(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    reference uniform float minZ,
    reference uniform float maxZ
    )
{
    // Per-lane running bounds, seeded so that any accepted sample tightens them
    float laneNearest = cameraFar;
    float laneFarthest = cameraNear;

    for (uniform int32 row = tileStartY; row < tileEndY; ++row) {
        uniform int32 rowOffset = row * gBufferWidth;

        for (uniform int32 col = tileStartX; col < tileEndX; col += programCount) {
            // Unproject the depth buffer value into view space
            float depth = zBuffer[(rowOffset + col) + programIndex];
            float viewZ = cameraProj_43 / (depth - cameraProj_33);

            // Leave out skybox/background or otherwise invalid pixels
            bool usable = (viewZ >= cameraNear) && (viewZ < cameraFar);
            if (usable) {
                laneNearest = min(laneNearest, viewZ);
                laneFarthest = max(laneFarthest, viewZ);
            }
        }
    }

    // Collapse the per-lane bounds into one uniform result
    minZ = reduce_min(laneNearest);
    maxZ = reduce_max(laneFarthest);
}
|
||||||
|
|
||||||
|
|
||||||
|
// tile width must be a multiple of programCount (SIMD size)
|
||||||
|
// numLights must currently be a multiple of programCount (SIMD size)
|
||||||
|
export uniform int32
|
||||||
|
IntersectLightsWithTileMinMax(
|
||||||
|
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||||
|
uniform int32 tileStartY, uniform int32 tileEndY,
|
||||||
|
// Tile data
|
||||||
|
uniform float minZ,
|
||||||
|
uniform float maxZ,
|
||||||
|
// G-buffer data
|
||||||
|
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
|
||||||
|
// Camera data
|
||||||
|
uniform float cameraProj_11, uniform float cameraProj_22,
|
||||||
|
// Light Data
|
||||||
|
uniform int32 numLights,
|
||||||
|
uniform float light_positionView_x_array[],
|
||||||
|
uniform float light_positionView_y_array[],
|
||||||
|
uniform float light_positionView_z_array[],
|
||||||
|
uniform float light_attenuationEnd_array[],
|
||||||
|
// Output
|
||||||
|
reference uniform int32 tileLightIndices[]
|
||||||
|
)
|
||||||
|
{
|
||||||
|
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||||
|
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
|
||||||
|
|
||||||
|
// Parallelize across frustum planes.
|
||||||
|
// We really only have four side planes here, but write the code to
|
||||||
|
// handle programCount > 4 robustly
|
||||||
|
uniform float frustumPlanes_xy[programCount];
|
||||||
|
uniform float frustumPlanes_z[programCount];
|
||||||
|
|
||||||
|
// TODO: If programIndex < 4 here? Don't care about masking off the
|
||||||
|
// rest but if interleaving ("x2" modes) the other lanes should ideally
|
||||||
|
// not be emitted...
|
||||||
|
{
|
||||||
|
// This one is totally constant over the whole screen... worth pulling it up at all?
|
||||||
|
float frustumPlanes_xy_v;
|
||||||
|
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
|
||||||
|
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_11 * gBufferScale_x));
|
||||||
|
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2, (cameraProj_22 * gBufferScale_y));
|
||||||
|
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));
|
||||||
|
|
||||||
|
float frustumPlanes_z_v;
|
||||||
|
frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileEndX - gBufferScale_x);
|
||||||
|
frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
|
||||||
|
frustumPlanes_z_v = insert(frustumPlanes_z_v, 2, tileEndY - gBufferScale_y);
|
||||||
|
frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);
|
||||||
|
|
||||||
|
// Normalize
|
||||||
|
float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
|
||||||
|
frustumPlanes_z_v * frustumPlanes_z_v);
|
||||||
|
frustumPlanes_xy_v *= norm;
|
||||||
|
frustumPlanes_z_v *= norm;
|
||||||
|
|
||||||
|
// Save out for uniform use later
|
||||||
|
frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
|
||||||
|
frustumPlanes_z[programIndex] = frustumPlanes_z_v;
|
||||||
|
}
|
||||||
|
|
||||||
|
uniform int32 tileNumLights = 0;
|
||||||
|
|
||||||
|
for (uniform int32 baseLightIndex = 0; baseLightIndex < numLights;
|
||||||
|
baseLightIndex += programCount) {
|
||||||
|
int32 lightIndex = baseLightIndex + programIndex;
|
||||||
|
float light_positionView_z = light_positionView_z_array[lightIndex];
|
||||||
|
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
||||||
|
float light_attenuationEndNeg = -light_attenuationEnd;
|
||||||
|
|
||||||
|
float d = light_positionView_z - minZ;
|
||||||
|
bool inFrustum = (d >= light_attenuationEndNeg);
|
||||||
|
|
||||||
|
d = maxZ - light_positionView_z;
|
||||||
|
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||||
|
|
||||||
|
// This seems better than cif(!inFrustum) ccontinue; here since we
|
||||||
|
// don't actually need to mask the rest of this function - this is
|
||||||
|
// just a greedy early-out. Could also structure all of this as
|
||||||
|
// nested if() statements, but this is a bit easier to read
|
||||||
|
if (!any(inFrustum))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||||
|
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||||
|
|
||||||
|
d = light_positionView_z * frustumPlanes_z[0] +
|
||||||
|
light_positionView_x * frustumPlanes_xy[0];
|
||||||
|
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||||
|
|
||||||
|
d = light_positionView_z * frustumPlanes_z[1] +
|
||||||
|
light_positionView_x * frustumPlanes_xy[1];
|
||||||
|
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||||
|
|
||||||
|
d = light_positionView_z * frustumPlanes_z[2] +
|
||||||
|
light_positionView_y * frustumPlanes_xy[2];
|
||||||
|
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||||
|
|
||||||
|
d = light_positionView_z * frustumPlanes_z[3] +
|
||||||
|
light_positionView_y * frustumPlanes_xy[3];
|
||||||
|
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||||
|
|
||||||
|
// Pack and store intersecting lights
|
||||||
|
cif (inFrustum) {
|
||||||
|
tileNumLights += packed_store_active(tileLightIndices, tileNumLights,
|
||||||
|
lightIndex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return tileNumLights;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// tile width must be a multiple of programCount (SIMD size)
|
||||||
|
// numLights must currently be a multiple of programCount (SIMD size)
|
||||||
|
// Builds the light list for one tile: computes the tile's view-space depth
// bounds from the Z buffer (ComputeZBounds) and then culls the first
// numLights lights against the tile (IntersectLightsWithTileMinMax).
//
// Parameters:
//   tileStartX/tileEndX, tileStartY/tileEndY - pixel rectangle of the tile
//   gBufferWidth, gBufferHeight              - G-buffer dimensions in pixels
//   zBuffer                                  - depth buffer to scan
//   cameraProj_11 .. cameraProj_43           - projection matrix entries
//   cameraNear, cameraFar                    - valid view-space depth range
//   numLights                                - number of lights to test
//   light_*_array                            - per-light view-space position
//                                              and attenuation end distance
//   tileLightIndices                         - receives the packed indices of
//                                              the lights that pass the test
//
// Returns the number of indices written to tileLightIndices.
static uniform int32
IntersectLightsWithTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // G-buffer data
    uniform float zBuffer[],
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    reference uniform int32 tileLightIndices[]
    )
{
    uniform float minZ, maxZ;
    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
        minZ, maxZ);

    // Forward the caller's light count. This used to pass MAX_LIGHTS
    // unconditionally, which silently ignored the numLights argument.
    uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
        numLights, light_positionView_x_array, light_positionView_y_array,
        light_positionView_z_array, light_attenuationEnd_array,
        tileLightIndices);

    return tileNumLights;
}
|
||||||
|
|
||||||
|
|
||||||
|
// tile width must be a multiple of programCount (SIMD size)
|
||||||
|
export void
|
||||||
|
ShadeTile(
|
||||||
|
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||||
|
uniform int32 tileStartY, uniform int32 tileEndY,
|
||||||
|
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
|
||||||
|
reference uniform InputDataArrays inputData,
|
||||||
|
// Camera data
|
||||||
|
uniform float cameraProj_11, uniform float cameraProj_22,
|
||||||
|
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||||
|
// Light list
|
||||||
|
reference uniform int32 tileLightIndices[],
|
||||||
|
uniform int32 tileNumLights,
|
||||||
|
// UI
|
||||||
|
uniform bool visualizeLightCount,
|
||||||
|
// Output
|
||||||
|
reference uniform unsigned int8 framebuffer_r[],
|
||||||
|
reference uniform unsigned int8 framebuffer_g[],
|
||||||
|
reference uniform unsigned int8 framebuffer_b[]
|
||||||
|
)
|
||||||
|
{
|
||||||
|
if (tileNumLights == 0 || visualizeLightCount) {
|
||||||
|
uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
|
||||||
|
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||||
|
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
|
||||||
|
int32 framebufferIndex = (y * gBufferWidth + x) + programIndex;
|
||||||
|
framebuffer_r[framebufferIndex] = c;
|
||||||
|
framebuffer_g[framebufferIndex] = c;
|
||||||
|
framebuffer_b[framebufferIndex] = c;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
|
||||||
|
uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
|
||||||
|
|
||||||
|
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||||
|
uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||||
|
|
||||||
|
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
|
||||||
|
uniform int32 gBufferOffsetBase = y * gBufferWidth + x;
|
||||||
|
int32 gBufferOffset = gBufferOffsetBase + programIndex;
|
||||||
|
|
||||||
|
// Reconstruct position and (negative) view vector from G-buffer
|
||||||
|
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||||
|
float Vneg_x, Vneg_y, Vneg_z;
|
||||||
|
|
||||||
|
float z = inputData.zBuffer[gBufferOffset];
|
||||||
|
|
||||||
|
// Compute screen/clip-space position
|
||||||
|
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||||
|
float positionScreen_x = (0.5f + (float)(x + programIndex)) *
|
||||||
|
twoOverGBufferWidth - 1.0f;
|
||||||
|
|
||||||
|
// Unproject depth buffer Z value into view space
|
||||||
|
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
|
||||||
|
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||||
|
cameraProj_11;
|
||||||
|
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||||
|
cameraProj_22;
|
||||||
|
|
||||||
|
// We actually end up with a vector pointing *at* the
|
||||||
|
// surface (i.e. the negative view vector)
|
||||||
|
normalize3(surface_positionView_x, surface_positionView_y,
|
||||||
|
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
|
||||||
|
|
||||||
|
// Reconstruct normal from G-buffer
|
||||||
|
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||||
|
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
|
||||||
|
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
|
||||||
|
|
||||||
|
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||||
|
float m = sqrt(4.0f * f - 1.0f);
|
||||||
|
|
||||||
|
surface_normal_x = m * (4.0f * normal_x - 2.0f);
|
||||||
|
surface_normal_y = m * (4.0f * normal_y - 2.0f);
|
||||||
|
surface_normal_z = 3.0f - 8.0f * f;
|
||||||
|
|
||||||
|
// Load other G-buffer parameters
|
||||||
|
float surface_specularAmount =
|
||||||
|
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
|
||||||
|
float surface_specularPower =
|
||||||
|
half_to_float_fast(inputData.specularPower[gBufferOffset]);
|
||||||
|
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||||
|
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||||
|
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||||
|
|
||||||
|
float lit_x = 0.0f;
|
||||||
|
float lit_y = 0.0f;
|
||||||
|
float lit_z = 0.0f;
|
||||||
|
for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||||
|
++tileLightIndex) {
|
||||||
|
uniform int32 lightIndex = tileLightIndices[tileLightIndex];
|
||||||
|
|
||||||
|
// Gather light data relevant to initial culling
|
||||||
|
uniform float light_positionView_x =
|
||||||
|
inputData.lightPositionView_x[lightIndex];
|
||||||
|
uniform float light_positionView_y =
|
||||||
|
inputData.lightPositionView_y[lightIndex];
|
||||||
|
uniform float light_positionView_z =
|
||||||
|
inputData.lightPositionView_z[lightIndex];
|
||||||
|
uniform float light_attenuationEnd =
|
||||||
|
inputData.lightAttenuationEnd[lightIndex];
|
||||||
|
|
||||||
|
// Compute light vector
|
||||||
|
float L_x = light_positionView_x - surface_positionView_x;
|
||||||
|
float L_y = light_positionView_y - surface_positionView_y;
|
||||||
|
float L_z = light_positionView_z - surface_positionView_z;
|
||||||
|
|
||||||
|
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
|
||||||
|
|
||||||
|
// Clip at end of attenuation
|
||||||
|
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
|
||||||
|
|
||||||
|
cif (distanceToLight2 < light_attenutaionEnd2) {
|
||||||
|
float distanceToLight = sqrt(distanceToLight2);
|
||||||
|
|
||||||
|
// HLSL "rcp" is allowed to be fairly inaccurate
|
||||||
|
float distanceToLightRcp = rcp(distanceToLight);
|
||||||
|
L_x *= distanceToLightRcp;
|
||||||
|
L_y *= distanceToLightRcp;
|
||||||
|
L_z *= distanceToLightRcp;
|
||||||
|
|
||||||
|
// Start computing brdf
|
||||||
|
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||||
|
surface_normal_z, L_x, L_y, L_z);
|
||||||
|
|
||||||
|
// Clip back facing
|
||||||
|
cif (NdotL > 0.0f) {
|
||||||
|
uniform float light_attenuationBegin =
|
||||||
|
inputData.lightAttenuationBegin[lightIndex];
|
||||||
|
|
||||||
|
// Light distance attenuation (linstep)
|
||||||
|
float lightRange = (light_attenuationEnd - light_attenuationBegin);
|
||||||
|
float falloffPosition = (light_attenuationEnd - distanceToLight);
|
||||||
|
float attenuation = min(falloffPosition / lightRange, 1.0f);
|
||||||
|
|
||||||
|
float H_x = (L_x - Vneg_x);
|
||||||
|
float H_y = (L_y - Vneg_y);
|
||||||
|
float H_z = (L_z - Vneg_z);
|
||||||
|
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
|
||||||
|
|
||||||
|
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||||
|
surface_normal_z, H_x, H_y, H_z);
|
||||||
|
NdotH = max(NdotH, 0.0f);
|
||||||
|
|
||||||
|
float specular = pow(NdotH, surface_specularPower);
|
||||||
|
float specularNorm = (surface_specularPower + 2.0f) *
|
||||||
|
(1.0f / 8.0f);
|
||||||
|
float specularContrib = surface_specularAmount *
|
||||||
|
specularNorm * specular;
|
||||||
|
|
||||||
|
float k = attenuation * NdotL * (1.0f + specularContrib);
|
||||||
|
|
||||||
|
uniform float light_color_x = inputData.lightColor_x[lightIndex];
|
||||||
|
uniform float light_color_y = inputData.lightColor_y[lightIndex];
|
||||||
|
uniform float light_color_z = inputData.lightColor_z[lightIndex];
|
||||||
|
|
||||||
|
float lightContrib_x = surface_albedo_x * light_color_x;
|
||||||
|
float lightContrib_y = surface_albedo_y * light_color_y;
|
||||||
|
float lightContrib_z = surface_albedo_z * light_color_z;
|
||||||
|
|
||||||
|
lit_x += lightContrib_x * k;
|
||||||
|
lit_y += lightContrib_y * k;
|
||||||
|
lit_z += lightContrib_z * k;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Gamma correct
|
||||||
|
// These pows are pretty slow right now, but we can do
|
||||||
|
// something faster if really necessary to squeeze every
|
||||||
|
// last bit of performance out of it
|
||||||
|
float gamma = 1.0 / 2.2f;
|
||||||
|
lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
|
||||||
|
lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
|
||||||
|
lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
|
||||||
|
|
||||||
|
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
|
||||||
|
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
|
||||||
|
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// Static decomposition
|
||||||
|
|
||||||
|
// Task body for the static decomposition: culls the lights for, and then
// shades, the g-th MIN_TILE_WIDTH x MIN_TILE_HEIGHT tile of the framebuffer.
//
// Parameters:
//   g                    - linear tile index, row-major over the tile grid
//   num_groups_x         - number of tiles per row of the grid
//   num_groups_y         - number of tile rows (not used in this function)
//   inputHeader          - camera matrix, near/far and framebuffer size
//   inputData            - G-buffer and light arrays
//   visualizeLightCount  - forwarded to ShadeTile
//   framebuffer_r/g/b    - output colour planes
//
// The tile rectangle is clamped to the framebuffer. RenderStatic rounds the
// tile counts up, so without the clamp the right/bottom edge tiles would read
// and write past the end of the buffers when the framebuffer size is not a
// multiple of the tile size.
task void
RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
           reference uniform InputHeader inputHeader,
           reference uniform InputDataArrays inputData,
           uniform int visualizeLightCount,
           // Output
           reference uniform unsigned int8 framebuffer_r[],
           reference uniform unsigned int8 framebuffer_g[],
           reference uniform unsigned int8 framebuffer_b[]) {
    uniform int32 group_y = g / num_groups_x;
    uniform int32 group_x = g % num_groups_x;

    uniform int framebufferWidth = inputHeader.framebufferWidth;
    uniform int framebufferHeight = inputHeader.framebufferHeight;

    uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
    uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
    // Clamp so partially covered edge tiles stay inside the framebuffer
    uniform int32 tile_end_x = min(tile_start_x + MIN_TILE_WIDTH, framebufferWidth);
    uniform int32 tile_end_y = min(tile_start_y + MIN_TILE_HEIGHT, framebufferHeight);

    uniform int sTileNumLights = 0;
    uniform int sTileLightIndices[MAX_LIGHTS];  // Light list for the tile

    // Zero-based names here; the callees name the same entries one-based
    // (cameraProj_11, cameraProj_22, cameraProj_33, cameraProj_43).
    uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
    uniform float cameraProj_11 = inputHeader.cameraProj[1][1];
    uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
    uniform float cameraProj_32 = inputHeader.cameraProj[3][2];

    // Light intersection
    sTileNumLights =
        IntersectLightsWithTile(tile_start_x, tile_end_x,
                                tile_start_y, tile_end_y,
                                framebufferWidth, framebufferHeight,
                                inputData.zBuffer,
                                cameraProj_00, cameraProj_11,
                                cameraProj_22, cameraProj_32,
                                inputHeader.cameraNear, inputHeader.cameraFar,
                                MAX_LIGHTS,
                                inputData.lightPositionView_x,
                                inputData.lightPositionView_y,
                                inputData.lightPositionView_z,
                                inputData.lightAttenuationEnd,
                                sTileLightIndices);

    // Shade the tile with the lights that survived culling
    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
              framebufferWidth, framebufferHeight, inputData,
              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
              sTileLightIndices, sTileNumLights, visualizeLightCount,
              framebuffer_r, framebuffer_g, framebuffer_b);
}
|
||||||
|
|
||||||
|
|
||||||
|
// Exported entry point for the static decomposition path: covers the
// framebuffer with MIN_TILE_WIDTH x MIN_TILE_HEIGHT tiles and launches one
// RenderTile task per tile.
//
//   inputHeader         - supplies framebufferWidth / framebufferHeight
//   inputData           - forwarded to every task
//   visualizeLightCount - forwarded to every task
//   framebuffer_r/g/b   - output colour planes, forwarded to every task
export void
RenderStatic(reference uniform InputHeader inputHeader,
             reference uniform InputDataArrays inputData,
             uniform int visualizeLightCount,
             // Output
             reference uniform unsigned int8 framebuffer_r[],
             reference uniform unsigned int8 framebuffer_g[],
             reference uniform unsigned int8 framebuffer_b[]) {
    // Tile counts per axis, rounded up so a partial tile at the right or
    // bottom edge still gets a task.
    uniform int tilesAcross =
        (inputHeader.framebufferWidth + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
    uniform int tilesDown =
        (inputHeader.framebufferHeight + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
    uniform int totalTiles = tilesAcross * tilesDown;

    // One task per tile; the task decodes its 2D position from the linear
    // index it is handed.
    for (uniform int tile = 0; tile < totalTiles; ++tile) {
        launch < RenderTile(tile, tilesAcross, tilesDown,
                            inputHeader, inputData, visualizeLightCount,
                            framebuffer_r, framebuffer_g, framebuffer_b) >;
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// Routines for dynamic decomposition path
|
||||||
|
|
||||||
|
// tile width must be a multiple of programCount (SIMD size)
|
||||||
|
// Computes the min/max Z bounds for every tile in one row of tiles by
// calling ComputeZBounds once per tile.
//
//   tileY                  - index of the tile row to process
//   tileWidth, tileHeight  - tile size in pixels
//   numTilesX              - number of tiles in the row
//   numTilesY              - not used by this function
//   zBuffer, gBufferWidth  - forwarded to ComputeZBounds
//   cameraProj_33/_43, cameraNear, cameraFar - forwarded to ComputeZBounds
//   minZArray, maxZArray   - outputs; entry [tileX] receives the bounds of
//                            tile tileX of this row (indexed by column only)
export void
ComputeZBoundsRow(
    uniform int32 tileY,
    uniform int32 tileWidth, uniform int32 tileHeight,
    uniform int32 numTilesX, uniform int32 numTilesY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    reference uniform float minZArray[],
    reference uniform float maxZArray[]
    )
{
    // The vertical pixel extent is the same for every tile in the row.
    uniform int32 rowTop = tileY * tileHeight;
    uniform int32 rowBottom = rowTop + tileHeight;

    for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
        uniform int32 tileLeft = tileX * tileWidth;
        uniform int32 tileRight = tileLeft + tileWidth;

        uniform float tileMinZ, tileMaxZ;
        ComputeZBounds(
            tileLeft, tileRight,
            rowTop, rowBottom,
            zBuffer, gBufferWidth,
            cameraProj_33, cameraProj_43, cameraNear, cameraFar,
            tileMinZ, tileMaxZ);

        minZArray[tileX] = tileMinZ;
        maxZArray[tileX] = tileMaxZ;
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
// numLights need not be a multiple of programCount here, but the input and output arrays
|
||||||
|
// should be able to handle programCount-sized load/stores.
|
||||||
|
// Splits a tile's light list into four per-subtile light lists.
//
// Each light in lightIndices[0..numLights) is tested against the Z range of
// each of the four subtiles and against the two planes that split the tile
// at (tileMidX, tileMidY). The index of every light that may touch a subtile
// is appended to that subtile's segment of subtileIndices.
//
//   tileMidX, tileMidY         - pixel coordinates of the split point
//   subtileMinZ, subtileMaxZ   - Z bounds of the four subtiles, in the
//                                order 00, 10, 01, 11
//   gBufferWidth/Height        - G-buffer size in pixels
//   cameraProj_11/_22          - projection terms used to build the planes
//   lightIndices, numLights    - input light list
//   light_*_array              - per-light data, indexed by light index
//   subtileIndices             - output; subtile s writes starting at
//                                s * subtileIndicesPitch
//   subtileIndicesPitch        - distance in elements between segments
//   subtileNumLights           - output; lights written for each subtile
export void
SplitTileMinMax(
    uniform int32 tileMidX, uniform int32 tileMidY,
    // Subtile data (00, 10, 01, 11)
    uniform float subtileMinZ[],
    uniform float subtileMaxZ[],
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    reference uniform int32 lightIndices[],
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Outputs
    // TODO: ISPC doesn't currently like multidimensional arrays so we'll do
    // the indexing math ourselves
    reference uniform int32 subtileIndices[],
    uniform int32 subtileIndicesPitch,
    reference uniform int32 subtileNumLights[]
    )
{
    uniform float halfWidth = 0.5f * (float)gBufferWidth;
    uniform float halfHeight = 0.5f * (float)gBufferHeight;

    // Parallelize across frustum planes.
    // Only have 2 frustum split planes here so may not be worth it, but
    // we'll do it for now for consistency.
    uniform float planeXY[programCount];
    uniform float planeZ[programCount];

    // Lane 0 holds the X split plane, lane 1 the Y split plane. The other
    // lanes are never assigned; only entries [0] and [1] are read back below.
    // This one is totally constant over the whole screen... worth pulling it
    // up at all?
    float planeXYLanes;
    planeXYLanes = insert(planeXYLanes, 0, -(cameraProj_11 * halfWidth));
    planeXYLanes = insert(planeXYLanes, 1, (cameraProj_22 * halfHeight));

    float planeZLanes;
    planeZLanes = insert(planeZLanes, 0, tileMidX - halfWidth);
    planeZLanes = insert(planeZLanes, 1, tileMidY - halfHeight);

    // Normalize each plane's two components to unit length.
    float invLength = rsqrt(planeXYLanes * planeXYLanes +
                            planeZLanes * planeZLanes);
    planeXYLanes *= invLength;
    planeZLanes *= invLength;

    // Spill to uniform arrays so individual planes can be read as uniforms.
    planeXY[programIndex] = planeXYLanes;
    planeZ[programIndex] = planeZLanes;

    // Next write position for each subtile; segment s starts at s * pitch.
    uniform int32 writePos[4];
    for (uniform int32 s = 0; s < 4; ++s)
        writePos[s] = s * subtileIndicesPitch;

    for (int32 i = programIndex; i < numLights; i += programCount) {
        // TODO: ISPC says gather required here when it actually
        // isn't... this could be fixed by nesting an if() within a
        // uniform loop, but I'm not totally sure if that's a win
        // overall. For now we'll just eat the perf cost for cleanliness
        // since the below are real gathers anyways.
        int32 lightIndex = lightIndices[i];

        float lightX = light_positionView_x_array[lightIndex];
        float lightY = light_positionView_y_array[lightIndex];
        float lightZ = light_positionView_z_array[lightIndex];
        float lightRadius = light_attenuationEnd_array[lightIndex];
        float negLightRadius = -lightRadius;

        // Test lights against the subtile Z bounds: a light survives when
        // it is no further than its attenuation distance outside [minZ, maxZ].
        bool touches[4];
        touches[0] = (lightZ - subtileMinZ[0] >= negLightRadius) &&
                     (subtileMaxZ[0] - lightZ >= negLightRadius);
        touches[1] = (lightZ - subtileMinZ[1] >= negLightRadius) &&
                     (subtileMaxZ[1] - lightZ >= negLightRadius);
        touches[2] = (lightZ - subtileMinZ[2] >= negLightRadius) &&
                     (subtileMaxZ[2] - lightZ >= negLightRadius);
        touches[3] = (lightZ - subtileMinZ[3] >= negLightRadius) &&
                     (subtileMaxZ[3] - lightZ >= negLightRadius);

        // Signed distances from the light centre to the two split planes.
        float dx = lightZ * planeZ[0] +
                   lightX * planeXY[0];
        float dy = lightZ * planeZ[1] +
                   lightY * planeXY[1];

        // If the light lies entirely on one side of the X split plane, drop
        // it from the two subtiles on the other side.
        cif (abs(dx) > lightRadius) {
            bool onPositiveX = dx > 0.0f;
            bool onNegativeX = !onPositiveX;
            touches[0] = touches[0] && onPositiveX;  // 00 subtile
            touches[1] = touches[1] && onNegativeX;  // 10 subtile
            touches[2] = touches[2] && onPositiveX;  // 01 subtile
            touches[3] = touches[3] && onNegativeX;  // 11 subtile
        }

        // Same for the Y split plane.
        cif (abs(dy) > lightRadius) {
            bool onPositiveY = dy > 0.0f;
            bool onNegativeY = !onPositiveY;
            touches[0] = touches[0] && onPositiveY;  // 00 subtile
            touches[1] = touches[1] && onPositiveY;  // 10 subtile
            touches[2] = touches[2] && onNegativeY;  // 01 subtile
            touches[3] = touches[3] && onNegativeY;  // 11 subtile
        }

        // Pack and store intersecting lights; packed_store_active's return
        // value advances the subtile's write position.
        // TODO: Experiment with a loop here instead
        cif (touches[0])
            writePos[0] += packed_store_active(subtileIndices,
                                               writePos[0],
                                               lightIndex);
        cif (touches[1])
            writePos[1] += packed_store_active(subtileIndices,
                                               writePos[1],
                                               lightIndex);
        cif (touches[2])
            writePos[2] += packed_store_active(subtileIndices,
                                               writePos[2],
                                               lightIndex);
        cif (touches[3])
            writePos[3] += packed_store_active(subtileIndices,
                                               writePos[3],
                                               lightIndex);
    }

    // Lights stored per subtile = final write position minus segment start.
    for (uniform int32 s = 0; s < 4; ++s)
        subtileNumLights[s] = writePos[s] - s * subtileIndicesPitch;
}
|
||||||
137
examples/deferred/main.cpp
Normal file
137
examples/deferred/main.cpp
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#define ISPC_IS_WINDOWS
|
||||||
|
#define NOMINMAX
|
||||||
|
#elif defined(__linux__)
|
||||||
|
#define ISPC_IS_LINUX
|
||||||
|
#elif defined(__APPLE__)
|
||||||
|
#define ISPC_IS_APPLE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <float.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <vector>
|
||||||
|
#ifdef ISPC_IS_WINDOWS
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#include <windows.h>
|
||||||
|
#endif
|
||||||
|
#include "deferred.h"
|
||||||
|
#include "kernels_ispc.h"
|
||||||
|
#include "../timing.h"
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
if (argc != 2) {
|
||||||
|
printf("usage: deferred_shading <input_file>\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
InputData *input = CreateInputDataFromFile(argv[1]);
|
||||||
|
if (!input) {
|
||||||
|
printf("Failed to load input file \"%s\"!\n", argv[1]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
Framebuffer framebuffer(input->header.framebufferWidth,
|
||||||
|
input->header.framebufferHeight);
|
||||||
|
|
||||||
|
InitDynamicC(input);
|
||||||
|
#ifdef __cilkplusplus
|
||||||
|
InitDynamicCilk(input);
|
||||||
|
#endif // __cilkplusplus
|
||||||
|
|
||||||
|
int nframes = 5;
|
||||||
|
double ispcCycles = 1e30;
|
||||||
|
for (int i = 0; i < 5; ++i) {
|
||||||
|
framebuffer.clear();
|
||||||
|
reset_and_start_timer();
|
||||||
|
for (int j = 0; j < nframes; ++j)
|
||||||
|
ispc::RenderStatic(&input->header, &input->arrays,
|
||||||
|
VISUALIZE_LIGHT_COUNT,
|
||||||
|
framebuffer.r, framebuffer.g, framebuffer.b);
|
||||||
|
double mcycles = get_elapsed_mcycles() / nframes;
|
||||||
|
ispcCycles = std::min(ispcCycles, mcycles);
|
||||||
|
}
|
||||||
|
printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
|
||||||
|
"%d x %d image\n", ispcCycles,
|
||||||
|
input->header.framebufferWidth, input->header.framebufferHeight);
|
||||||
|
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
|
||||||
|
|
||||||
|
double serialCycles = 1e30;
|
||||||
|
for (int i = 0; i < 5; ++i) {
|
||||||
|
framebuffer.clear();
|
||||||
|
reset_and_start_timer();
|
||||||
|
for (int j = 0; j < nframes; ++j)
|
||||||
|
DispatchDynamicC(input, &framebuffer);
|
||||||
|
double mcycles = get_elapsed_mcycles() / nframes;
|
||||||
|
serialCycles = std::min(serialCycles, mcycles);
|
||||||
|
}
|
||||||
|
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles\n",
|
||||||
|
serialCycles);
|
||||||
|
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
|
||||||
|
|
||||||
|
#ifdef __cilkplusplus
|
||||||
|
double dynamicCilkCycles = 1e30;
|
||||||
|
for (int i = 0; i < 5; ++i) {
|
||||||
|
framebuffer.clear();
|
||||||
|
reset_and_start_timer();
|
||||||
|
for (int j = 0; j < nframes; ++j)
|
||||||
|
DispatchDynamicCilk(input, &framebuffer);
|
||||||
|
double mcycles = get_elapsed_mcycles() / nframes;
|
||||||
|
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
|
||||||
|
}
|
||||||
|
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles\n",
|
||||||
|
dynamicCilkCycles);
|
||||||
|
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
|
||||||
|
|
||||||
|
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
|
||||||
|
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
|
||||||
|
#else
|
||||||
|
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
|
||||||
|
#endif // __cilkplusplus
|
||||||
|
|
||||||
|
DeleteInputData(input);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
@@ -18,8 +18,11 @@ EndProject
|
|||||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
|
||||||
EndProject
|
EndProject
|
||||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "volume", "volume_rendering\volume.vcxproj", "{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}"
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "volume", "volume_rendering\volume.vcxproj", "{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}"
|
||||||
|
EndProject
|
||||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.vcxproj", "{2EF070A1-F62F-4E6A-944B-88D140945C3C}"
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.vcxproj", "{2EF070A1-F62F-4E6A-944B-88D140945C3C}"
|
||||||
EndProject
|
EndProject
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
|
||||||
|
EndProject
|
||||||
Global
|
Global
|
||||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
Debug|Win32 = Debug|Win32
|
Debug|Win32 = Debug|Win32
|
||||||
@@ -108,6 +111,14 @@ Global
|
|||||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.Build.0 = Release|Win32
|
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.Build.0 = Release|Win32
|
||||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.ActiveCfg = Release|x64
|
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.ActiveCfg = Release|x64
|
||||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.Build.0 = Release|x64
|
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.Build.0 = Release|x64
|
||||||
|
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||||
|
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.Build.0 = Debug|Win32
|
||||||
|
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.ActiveCfg = Release|Win32
|
||||||
|
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
|
||||||
|
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
|
||||||
EndGlobalSection
|
EndGlobalSection
|
||||||
GlobalSection(SolutionProperties) = preSolution
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
HideSolutionNode = FALSE
|
HideSolutionNode = FALSE
|
||||||
|
|||||||
0
examples/mandelbrot/mandelbrot.vcxproj
Executable file → Normal file
0
examples/mandelbrot/mandelbrot.vcxproj
Executable file → Normal file
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
|
|||||||
float z_re = c_re, z_im = c_im;
|
float z_re = c_re, z_im = c_im;
|
||||||
int i;
|
int i;
|
||||||
for (i = 0; i < count; ++i) {
|
for (i = 0; i < count; ++i) {
|
||||||
if (z_re * z_re + z_im * z_im > 4.)
|
if (z_re * z_re + z_im * z_im > 4.f)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
float new_re = z_re*z_re - z_im*z_im;
|
float new_re = z_re*z_re - z_im*z_im;
|
||||||
|
|||||||
@@ -1,14 +1,8 @@
|
|||||||
|
|
||||||
ARCH = $(shell uname)
|
ARCH = $(shell uname)
|
||||||
|
|
||||||
TASK_CXX=../tasks_pthreads.cpp
|
TASK_CXX=../tasksys.cpp
|
||||||
TASK_LIB=-lpthread
|
TASK_LIB=-lpthread
|
||||||
|
|
||||||
ifeq ($(ARCH), Darwin)
|
|
||||||
TASK_CXX=../tasks_gcd.cpp
|
|
||||||
TASK_LIB=
|
|
||||||
endif
|
|
||||||
|
|
||||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||||
|
|
||||||
CXX=g++
|
CXX=g++
|
||||||
|
|||||||
@@ -40,6 +40,7 @@
|
|||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <string.h>
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
#include "../cpuid.h"
|
#include "../cpuid.h"
|
||||||
#include "mandelbrot_ispc.h"
|
#include "mandelbrot_ispc.h"
|
||||||
@@ -99,8 +100,12 @@ ensureTargetISAIsSupported() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prints the command-line synopsis to stderr and terminates the process
// with exit status 1. Does not return.
static void usage() {
    fputs("usage: mandelbrot [--scale=<factor>]\n", stderr);
    exit(1);
}
|
||||||
|
|
||||||
int main() {
|
int main(int argc, char *argv[]) {
|
||||||
unsigned int width = 1536;
|
unsigned int width = 1536;
|
||||||
unsigned int height = 1024;
|
unsigned int height = 1024;
|
||||||
float x0 = -2;
|
float x0 = -2;
|
||||||
@@ -108,6 +113,25 @@ int main() {
|
|||||||
float y0 = -1;
|
float y0 = -1;
|
||||||
float y1 = 1;
|
float y1 = 1;
|
||||||
|
|
||||||
|
if (argc == 1)
|
||||||
|
;
|
||||||
|
else if (argc == 2) {
|
||||||
|
if (strncmp(argv[1], "--scale=", 8) == 0) {
|
||||||
|
float scale = atof(argv[1] + 8);
|
||||||
|
if (scale == 0.f)
|
||||||
|
usage();
|
||||||
|
width *= scale;
|
||||||
|
height *= scale;
|
||||||
|
// round up to multiples of 16
|
||||||
|
width = (width + 0xf) & ~0xf;
|
||||||
|
height = (height + 0xf) & ~0xf;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
usage();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
usage();
|
||||||
|
|
||||||
ensureTargetISAIsSupported();
|
ensureTargetISAIsSupported();
|
||||||
|
|
||||||
int maxIterations = 512;
|
int maxIterations = 512;
|
||||||
@@ -119,6 +143,9 @@ int main() {
|
|||||||
//
|
//
|
||||||
double minISPC = 1e30;
|
double minISPC = 1e30;
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
|
// Clear out the buffer
|
||||||
|
for (unsigned int i = 0; i < width * height; ++i)
|
||||||
|
buf[i] = 0;
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
|
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||||
double dt = get_elapsed_mcycles();
|
double dt = get_elapsed_mcycles();
|
||||||
@@ -128,9 +155,6 @@ int main() {
|
|||||||
printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
|
printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
|
||||||
writePPM(buf, width, height, "mandelbrot-ispc.ppm");
|
writePPM(buf, width, height, "mandelbrot-ispc.ppm");
|
||||||
|
|
||||||
// Clear out the buffer
|
|
||||||
for (unsigned int i = 0; i < width * height; ++i)
|
|
||||||
buf[i] = 0;
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// And run the serial implementation 3 times, again reporting the
|
// And run the serial implementation 3 times, again reporting the
|
||||||
@@ -138,6 +162,9 @@ int main() {
|
|||||||
//
|
//
|
||||||
double minSerial = 1e30;
|
double minSerial = 1e30;
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
|
// Clear out the buffer
|
||||||
|
for (unsigned int i = 0; i < width * height; ++i)
|
||||||
|
buf[i] = 0;
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
|
mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||||
double dt = get_elapsed_mcycles();
|
double dt = get_elapsed_mcycles();
|
||||||
|
|||||||
@@ -53,11 +53,14 @@ mandel(float c_re, float c_im, int count) {
|
|||||||
[ystart,yend).
|
[ystart,yend).
|
||||||
*/
|
*/
|
||||||
task void
|
task void
|
||||||
mandelbrot_scanlines(uniform int ystart, uniform int yend,
|
mandelbrot_scanlines(uniform int ybase, uniform int span,
|
||||||
uniform float x0, uniform float dx,
|
uniform float x0, uniform float dx,
|
||||||
uniform float y0, uniform float dy,
|
uniform float y0, uniform float dy,
|
||||||
uniform int width, uniform int maxIterations,
|
uniform int width, uniform int maxIterations,
|
||||||
reference uniform int output[]) {
|
reference uniform int output[]) {
|
||||||
|
uniform int ystart = ybase + taskIndex * span;
|
||||||
|
uniform int yend = ystart + span;
|
||||||
|
|
||||||
for (uniform int j = ystart; j < yend; ++j) {
|
for (uniform int j = ystart; j < yend; ++j) {
|
||||||
for (uniform int i = 0; i < width; i += programCount) {
|
for (uniform int i = 0; i < width; i += programCount) {
|
||||||
float x = x0 + (programIndex + i) * dx;
|
float x = x0 + (programIndex + i) * dx;
|
||||||
@@ -70,6 +73,20 @@ mandelbrot_scanlines(uniform int ystart, uniform int yend,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Task that renders one horizontal chunk of the image by launching one
// mandelbrot_scanlines task per `span` scanlines of its chunk.
//
// The image rows [0, height) are divided among the taskCount chunk tasks.
// Chunk boundaries are computed as (taskIndex * height) / taskCount so that
// consecutive chunks tile the whole image even when height is not a multiple
// of taskCount; using taskIndex * (height / taskCount) instead would leave
// the last (height % taskCount) scanlines unrendered.
//
//   x0, dx, y0, dy - forwarded to mandelbrot_scanlines
//   width, height  - image size in pixels
//   maxIterations  - forwarded to mandelbrot_scanlines
//   output         - output buffer, forwarded to mandelbrot_scanlines
task void
mandelbrot_chunk(uniform float x0, uniform float dx,
                 uniform float y0, uniform float dy,
                 uniform int width, uniform int height,
                 uniform int maxIterations, reference uniform int output[]) {
    // Proportional split: chunk k covers [k*height/taskCount,
    // (k+1)*height/taskCount), so the chunks are contiguous and the last
    // one ends exactly at `height`.
    uniform int ystart = (taskIndex * height) / taskCount;
    uniform int yend = ((taskIndex + 1) * height) / taskCount;
    uniform int span = 1;
    uniform int numSpans = (yend - ystart) / span;

    // A chunk can be empty when there are fewer rows than chunk tasks;
    // don't issue a zero-sized launch in that case.
    if (numSpans > 0)
        launch[numSpans] < mandelbrot_scanlines(ystart, span, x0, dx, y0, dy,
                                                width, maxIterations, output) >;
}
|
||||||
|
|
||||||
|
|
||||||
export void
|
export void
|
||||||
mandelbrot_ispc(uniform float x0, uniform float y0,
|
mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||||
uniform float x1, uniform float y1,
|
uniform float x1, uniform float y1,
|
||||||
@@ -78,9 +95,6 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
|
|||||||
uniform float dx = (x1 - x0) / width;
|
uniform float dx = (x1 - x0) / width;
|
||||||
uniform float dy = (y1 - y0) / height;
|
uniform float dy = (y1 - y0) / height;
|
||||||
|
|
||||||
/* Launch task to compute results for spans of 'span' scanlines. */
|
launch[32] < mandelbrot_chunk(x0, dx, y0, dy, width, height,
|
||||||
uniform int span = 2;
|
|
||||||
for (uniform int j = 0; j < height; j += span)
|
|
||||||
launch < mandelbrot_scanlines(j, j+span, x0, dx, y0, dy, width,
|
|
||||||
maxIterations, output) >;
|
maxIterations, output) >;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
|
|||||||
float z_re = c_re, z_im = c_im;
|
float z_re = c_re, z_im = c_im;
|
||||||
int i;
|
int i;
|
||||||
for (i = 0; i < count; ++i) {
|
for (i = 0; i < count; ++i) {
|
||||||
if (z_re * z_re + z_im * z_im > 4.)
|
if (z_re * z_re + z_im * z_im > 4.f)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
float new_re = z_re*z_re - z_im*z_im;
|
float new_re = z_re*z_re - z_im*z_im;
|
||||||
|
|||||||
2
examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
Executable file → Normal file
2
examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
Executable file → Normal file
@@ -143,7 +143,7 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClCompile Include="mandelbrot.cpp" />
|
<ClCompile Include="mandelbrot.cpp" />
|
||||||
<ClCompile Include="mandelbrot_serial.cpp" />
|
<ClCompile Include="mandelbrot_serial.cpp" />
|
||||||
<ClCompile Include="../tasks_concrt.cpp" />
|
<ClCompile Include="../tasksys.cpp" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="mandelbrot.ispc">
|
<CustomBuild Include="mandelbrot.ispc">
|
||||||
|
|||||||
@@ -131,11 +131,11 @@ static float Noise(float x, float y, float z) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static float Turbulence(float x, float y, float z, int octaves) {
|
static float Turbulence(float x, float y, float z, uniform int octaves) {
|
||||||
float omega = 0.6;
|
float omega = 0.6;
|
||||||
|
|
||||||
float sum = 0., lambda = 1., o = 1.;
|
float sum = 0., lambda = 1., o = 1.;
|
||||||
for (int i = 0; i < octaves; ++i) {
|
for (uniform int i = 0; i < octaves; ++i) {
|
||||||
sum += abs(o * Noise(lambda * x, lambda * y, lambda * z));
|
sum += abs(o * Noise(lambda * x, lambda * y, lambda * z));
|
||||||
lambda *= 1.99f;
|
lambda *= 1.99f;
|
||||||
o *= omega;
|
o *= omega;
|
||||||
|
|||||||
0
examples/noise/noise.vcxproj
Executable file → Normal file
0
examples/noise/noise.vcxproj
Executable file → Normal file
@@ -104,7 +104,7 @@ inline float NoiseWeight(float t) {
|
|||||||
|
|
||||||
|
|
||||||
inline float Lerp(float t, float low, float high) {
|
inline float Lerp(float t, float low, float high) {
|
||||||
return (1. - t) * low + t * high;
|
return (1.f - t) * low + t * high;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -147,7 +147,7 @@ static float Turbulence(float x, float y, float z, int octaves) {
|
|||||||
lambda *= 1.99f;
|
lambda *= 1.99f;
|
||||||
o *= omega;
|
o *= omega;
|
||||||
}
|
}
|
||||||
return sum * 0.5;
|
return sum * 0.5f;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -163,7 +163,7 @@ void noise_serial(float x0, float y0, float x1, float y1,
|
|||||||
float y = y0 + j * dy;
|
float y = y0 + j * dy;
|
||||||
|
|
||||||
int index = (j * width + i);
|
int index = (j * width + i);
|
||||||
output[index] = Turbulence(x, y, 0.6, 8);
|
output[index] = Turbulence(x, y, 0.6f, 8);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
0
examples/options/options.vcxproj
Executable file → Normal file
0
examples/options/options.vcxproj
Executable file → Normal file
@@ -47,7 +47,7 @@ static inline float
|
|||||||
CND(float X) {
|
CND(float X) {
|
||||||
float L = fabsf(X);
|
float L = fabsf(X);
|
||||||
|
|
||||||
float k = 1.0 / (1.0 + 0.2316419 * L);
|
float k = 1.f / (1.f + 0.2316419f * L);
|
||||||
float k2 = k*k;
|
float k2 = k*k;
|
||||||
float k3 = k2*k;
|
float k3 = k2*k;
|
||||||
float k4 = k2*k2;
|
float k4 = k2*k2;
|
||||||
@@ -59,7 +59,7 @@ CND(float X) {
|
|||||||
w *= invSqrt2Pi * expf(-L * L * .5f);
|
w *= invSqrt2Pi * expf(-L * L * .5f);
|
||||||
|
|
||||||
if (X > 0.f)
|
if (X > 0.f)
|
||||||
w = 1.0 - w;
|
w = 1.f - w;
|
||||||
return w;
|
return w;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -94,7 +94,7 @@ binomial_put_serial(float Sa[], float Xa[], float Ta[],
|
|||||||
|
|
||||||
float dt = T / BINOMIAL_NUM;
|
float dt = T / BINOMIAL_NUM;
|
||||||
float u = expf(v * sqrtf(dt));
|
float u = expf(v * sqrtf(dt));
|
||||||
float d = 1. / u;
|
float d = 1.f / u;
|
||||||
float disc = expf(r * dt);
|
float disc = expf(r * dt);
|
||||||
float Pu = (disc - d) / (u - d);
|
float Pu = (disc - d) / (u - d);
|
||||||
|
|
||||||
|
|||||||
@@ -1,14 +1,8 @@
|
|||||||
|
|
||||||
ARCH = $(shell uname)
|
ARCH = $(shell uname)
|
||||||
|
|
||||||
TASK_CXX=../tasks_pthreads.cpp
|
TASK_CXX=../tasksys.cpp
|
||||||
TASK_LIB=-lpthread
|
TASK_LIB=-lpthread
|
||||||
|
|
||||||
ifeq ($(ARCH), Darwin)
|
|
||||||
TASK_CXX=../tasks_gcd.cpp
|
|
||||||
TASK_LIB=
|
|
||||||
endif
|
|
||||||
|
|
||||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||||
|
|
||||||
CXX=g++
|
CXX=g++
|
||||||
|
|||||||
@@ -42,6 +42,7 @@
|
|||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include <string.h>
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
#include "../cpuid.h"
|
#include "../cpuid.h"
|
||||||
@@ -51,7 +52,8 @@ using namespace ispc;
|
|||||||
|
|
||||||
typedef unsigned int uint;
|
typedef unsigned int uint;
|
||||||
|
|
||||||
extern void raytrace_serial(int width, int height, const float raster2camera[4][4],
|
extern void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
|
||||||
|
const float raster2camera[4][4],
|
||||||
const float camera2world[4][4], float image[],
|
const float camera2world[4][4], float image[],
|
||||||
int id[], const LinearBVHNode nodes[],
|
int id[], const LinearBVHNode nodes[],
|
||||||
const Triangle triangles[]);
|
const Triangle triangles[]);
|
||||||
@@ -126,11 +128,28 @@ ensureTargetISAIsSupported() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
static void usage() {
|
||||||
if (argc != 2) {
|
fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
|
||||||
fprintf(stderr, "usage: rt <filename base>\n");
|
|
||||||
exit(1);
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
float scale = 1.f;
|
||||||
|
const char *filename = NULL;
|
||||||
|
for (int i = 1; i < argc; ++i) {
|
||||||
|
if (strncmp(argv[i], "--scale=", 8) == 0) {
|
||||||
|
scale = atof(argv[i] + 8);
|
||||||
|
if (scale == 0.f)
|
||||||
|
usage();
|
||||||
}
|
}
|
||||||
|
else if (filename != NULL)
|
||||||
|
usage();
|
||||||
|
else
|
||||||
|
filename = argv[i];
|
||||||
|
}
|
||||||
|
if (filename == NULL)
|
||||||
|
usage();
|
||||||
|
|
||||||
ensureTargetISAIsSupported();
|
ensureTargetISAIsSupported();
|
||||||
|
|
||||||
@@ -144,10 +163,10 @@ int main(int argc, char *argv[]) {
|
|||||||
// Read the camera specification information from the camera file
|
// Read the camera specification information from the camera file
|
||||||
//
|
//
|
||||||
char fnbuf[1024];
|
char fnbuf[1024];
|
||||||
sprintf(fnbuf, "%s.camera", argv[1]);
|
sprintf(fnbuf, "%s.camera", filename);
|
||||||
FILE *f = fopen(fnbuf, "rb");
|
FILE *f = fopen(fnbuf, "rb");
|
||||||
if (!f) {
|
if (!f) {
|
||||||
perror(argv[1]);
|
perror(fnbuf);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -155,20 +174,20 @@ int main(int argc, char *argv[]) {
|
|||||||
// Nothing fancy, and trouble if we run on a big-endian system, just
|
// Nothing fancy, and trouble if we run on a big-endian system, just
|
||||||
// fread in the bits
|
// fread in the bits
|
||||||
//
|
//
|
||||||
int width, height;
|
int baseWidth, baseHeight;
|
||||||
float camera2world[4][4], raster2camera[4][4];
|
float camera2world[4][4], raster2camera[4][4];
|
||||||
READ(width, 1);
|
READ(baseWidth, 1);
|
||||||
READ(height, 1);
|
READ(baseHeight, 1);
|
||||||
READ(camera2world[0][0], 16);
|
READ(camera2world[0][0], 16);
|
||||||
READ(raster2camera[0][0], 16);
|
READ(raster2camera[0][0], 16);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Read in the serialized BVH
|
// Read in the serialized BVH
|
||||||
//
|
//
|
||||||
sprintf(fnbuf, "%s.bvh", argv[1]);
|
sprintf(fnbuf, "%s.bvh", filename);
|
||||||
f = fopen(fnbuf, "rb");
|
f = fopen(fnbuf, "rb");
|
||||||
if (!f) {
|
if (!f) {
|
||||||
perror(argv[2]);
|
perror(fnbuf);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -215,10 +234,10 @@ int main(int argc, char *argv[]) {
|
|||||||
}
|
}
|
||||||
fclose(f);
|
fclose(f);
|
||||||
|
|
||||||
// round image resolution up to multiple of 4 to make things easy for
|
// round image resolution up to multiple of 16 to make things easy for
|
||||||
// the code that assigns pixels to ispc program instances
|
// the code that assigns pixels to ispc program instances
|
||||||
height = (height + 3) & ~3;
|
int height = (int(baseHeight * scale) + 0xf) & ~0xf;
|
||||||
width = (width + 3) & ~3;
|
int width = (int(baseWidth * scale) + 0xf) & ~0xf;
|
||||||
|
|
||||||
// allocate images; one to hold hit object ids, one to hold depth to
|
// allocate images; one to hold hit object ids, one to hold depth to
|
||||||
// the first interseciton
|
// the first interseciton
|
||||||
@@ -231,8 +250,8 @@ int main(int argc, char *argv[]) {
|
|||||||
double minTimeISPC = 1e30;
|
double minTimeISPC = 1e30;
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
raytrace_ispc(width, height, raster2camera, camera2world,
|
raytrace_ispc(width, height, baseWidth, baseHeight, raster2camera,
|
||||||
image, id, nodes, triangles);
|
camera2world, image, id, nodes, triangles);
|
||||||
double dt = get_elapsed_mcycles();
|
double dt = get_elapsed_mcycles();
|
||||||
minTimeISPC = std::min(dt, minTimeISPC);
|
minTimeISPC = std::min(dt, minTimeISPC);
|
||||||
}
|
}
|
||||||
@@ -250,8 +269,8 @@ int main(int argc, char *argv[]) {
|
|||||||
double minTimeISPCtasks = 1e30;
|
double minTimeISPCtasks = 1e30;
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
raytrace_ispc_tasks(width, height, raster2camera, camera2world,
|
raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera,
|
||||||
image, id, nodes, triangles);
|
camera2world, image, id, nodes, triangles);
|
||||||
double dt = get_elapsed_mcycles();
|
double dt = get_elapsed_mcycles();
|
||||||
minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
|
minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
|
||||||
}
|
}
|
||||||
@@ -270,8 +289,8 @@ int main(int argc, char *argv[]) {
|
|||||||
double minTimeSerial = 1e30;
|
double minTimeSerial = 1e30;
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
raytrace_serial(width, height, raster2camera, camera2world,
|
raytrace_serial(width, height, baseWidth, baseHeight, raster2camera,
|
||||||
image, id, nodes, triangles);
|
camera2world, image, id, nodes, triangles);
|
||||||
double dt = get_elapsed_mcycles();
|
double dt = get_elapsed_mcycles();
|
||||||
minTimeSerial = std::min(dt, minTimeSerial);
|
minTimeSerial = std::min(dt, minTimeSerial);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -227,12 +227,17 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
|||||||
|
|
||||||
|
|
||||||
static void raytrace_tile(uniform int x0, uniform int x1,
|
static void raytrace_tile(uniform int x0, uniform int x1,
|
||||||
uniform int y0, uniform int y1, uniform int width,
|
uniform int y0, uniform int y1,
|
||||||
|
uniform int width, uniform int height,
|
||||||
|
uniform int baseWidth, uniform int baseHeight,
|
||||||
const uniform float raster2camera[4][4],
|
const uniform float raster2camera[4][4],
|
||||||
const uniform float camera2world[4][4],
|
const uniform float camera2world[4][4],
|
||||||
uniform float image[], uniform int id[],
|
uniform float image[], uniform int id[],
|
||||||
const LinearBVHNode nodes[],
|
const LinearBVHNode nodes[],
|
||||||
const Triangle triangles[]) {
|
const Triangle triangles[]) {
|
||||||
|
uniform float widthScale = (float)(baseWidth) / (float)(width);
|
||||||
|
uniform float heightScale = (float)(baseHeight) / (float)(height);
|
||||||
|
|
||||||
static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
|
static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
|
||||||
0, 1, 0, 1, 2, 3, 2, 3 };
|
0, 1, 0, 1, 2, 3, 2, 3 };
|
||||||
static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
|
static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
|
||||||
@@ -252,7 +257,8 @@ static void raytrace_tile(uniform int x0, uniform int x1,
|
|||||||
const float dy = udy[o * programCount + programIndex];
|
const float dy = udy[o * programCount + programIndex];
|
||||||
|
|
||||||
Ray ray;
|
Ray ray;
|
||||||
generateRay(raster2camera, camera2world, x+dx, y+dy, ray);
|
generateRay(raster2camera, camera2world, (x+dx)*widthScale,
|
||||||
|
(y+dy)*heightScale, ray);
|
||||||
BVHIntersect(nodes, triangles, ray);
|
BVHIntersect(nodes, triangles, ray);
|
||||||
|
|
||||||
int offset = (y + (int)dy) * width + (x + (int)dx);
|
int offset = (y + (int)dy) * width + (x + (int)dx);
|
||||||
@@ -265,42 +271,51 @@ static void raytrace_tile(uniform int x0, uniform int x1,
|
|||||||
|
|
||||||
|
|
||||||
export void raytrace_ispc(uniform int width, uniform int height,
|
export void raytrace_ispc(uniform int width, uniform int height,
|
||||||
|
uniform int baseWidth, uniform int baseHeight,
|
||||||
const uniform float raster2camera[4][4],
|
const uniform float raster2camera[4][4],
|
||||||
const uniform float camera2world[4][4],
|
const uniform float camera2world[4][4],
|
||||||
uniform float image[], uniform int id[],
|
uniform float image[], uniform int id[],
|
||||||
const LinearBVHNode nodes[],
|
const LinearBVHNode nodes[],
|
||||||
const Triangle triangles[]) {
|
const Triangle triangles[]) {
|
||||||
raytrace_tile(0, width, 0, height, width, raster2camera, camera2world, image,
|
raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
|
||||||
|
raster2camera, camera2world, image,
|
||||||
id, nodes, triangles);
|
id, nodes, triangles);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
task void raytrace_tile_task(uniform int x0, uniform int x1,
|
task void raytrace_tile_task(uniform int y0, uniform int y1,
|
||||||
uniform int y0, uniform int y1, uniform int width,
|
uniform int width, uniform int height,
|
||||||
|
uniform int baseWidth, uniform int baseHeight,
|
||||||
const uniform float raster2camera[4][4],
|
const uniform float raster2camera[4][4],
|
||||||
const uniform float camera2world[4][4],
|
const uniform float camera2world[4][4],
|
||||||
uniform float image[], uniform int id[],
|
uniform float image[], uniform int id[],
|
||||||
const LinearBVHNode nodes[],
|
const LinearBVHNode nodes[],
|
||||||
const Triangle triangles[]) {
|
const Triangle triangles[]) {
|
||||||
raytrace_tile(x0, x1, y0, y1, width, raster2camera, camera2world, image,
|
uniform int dx = 16; // must match dx below
|
||||||
|
uniform int xTasks = (width + (dx-1)) / dx;
|
||||||
|
uniform int x0 = (taskIndex % xTasks) * dx;
|
||||||
|
uniform int x1 = x0 + dx;
|
||||||
|
x1 = min(x1, width);
|
||||||
|
|
||||||
|
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
|
||||||
|
raster2camera, camera2world, image,
|
||||||
id, nodes, triangles);
|
id, nodes, triangles);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export void raytrace_ispc_tasks(uniform int width, uniform int height,
|
export void raytrace_ispc_tasks(uniform int width, uniform int height,
|
||||||
|
uniform int baseWidth, uniform int baseHeight,
|
||||||
const uniform float raster2camera[4][4],
|
const uniform float raster2camera[4][4],
|
||||||
const uniform float camera2world[4][4],
|
const uniform float camera2world[4][4],
|
||||||
uniform float image[], uniform int id[],
|
uniform float image[], uniform int id[],
|
||||||
const LinearBVHNode nodes[],
|
const LinearBVHNode nodes[],
|
||||||
const Triangle triangles[]) {
|
const Triangle triangles[]) {
|
||||||
uniform int dx = 16, dy = 16;
|
uniform int dx = 16, dy = 16;
|
||||||
|
uniform int nTasks = (width + (dx-1)) / dx;
|
||||||
for (uniform int y = 0; y < height; y += dy) {
|
for (uniform int y = 0; y < height; y += dy) {
|
||||||
uniform int y1 = min(y + dy, height);
|
uniform int y1 = min(y + dy, height);
|
||||||
for (uniform int x = 0; x < width; x += dx) {
|
launch[nTasks] < raytrace_tile_task(y, y1, width, height, baseWidth,
|
||||||
uniform int x1 = min(x + dx, width);
|
baseHeight, raster2camera, camera2world,
|
||||||
launch < raytrace_tile_task(x, x1, y, y1, width, raster2camera,
|
image, id, nodes, triangles) >;
|
||||||
camera2world, image, id, nodes,
|
|
||||||
triangles) >;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
2
examples/rt/rt.vcxproj
Executable file → Normal file
2
examples/rt/rt.vcxproj
Executable file → Normal file
@@ -164,7 +164,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClCompile Include="rt.cpp" />
|
<ClCompile Include="rt.cpp" />
|
||||||
<ClCompile Include="rt_serial.cpp" />
|
<ClCompile Include="rt_serial.cpp" />
|
||||||
<ClCompile Include="../tasks_concrt.cpp" />
|
<ClCompile Include="../tasksys.cpp" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
|||||||
@@ -258,17 +258,21 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void raytrace_serial(int width, int height,
|
void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
|
||||||
const float raster2camera[4][4],
|
const float raster2camera[4][4],
|
||||||
const float camera2world[4][4],
|
const float camera2world[4][4],
|
||||||
float image[],
|
float image[],
|
||||||
int id[],
|
int id[],
|
||||||
const LinearBVHNode nodes[],
|
const LinearBVHNode nodes[],
|
||||||
const Triangle triangles[]) {
|
const Triangle triangles[]) {
|
||||||
|
float widthScale = float(baseWidth) / float(width);
|
||||||
|
float heightScale = float(baseHeight) / float(height);
|
||||||
|
|
||||||
for (int y = 0; y < height; ++y) {
|
for (int y = 0; y < height; ++y) {
|
||||||
for (int x = 0; x < width; ++x) {
|
for (int x = 0; x < width; ++x) {
|
||||||
Ray ray;
|
Ray ray;
|
||||||
generateRay(raster2camera, camera2world, x, y, ray);
|
generateRay(raster2camera, camera2world, x * widthScale,
|
||||||
|
y * heightScale, ray);
|
||||||
BVHIntersect(nodes, triangles, ray);
|
BVHIntersect(nodes, triangles, ray);
|
||||||
|
|
||||||
int offset = y * width + x;
|
int offset = y * width + x;
|
||||||
|
|||||||
4
examples/simple/simple.vcxproj
Executable file → Normal file
4
examples/simple/simple.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
|||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
<ItemGroup Label="ProjectConfigurations">
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
<ProjectConfiguration Include="Debug|Win32">
|
<ProjectConfiguration Include="Debug|Win32">
|
||||||
@@ -28,7 +28,7 @@
|
|||||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
ispc -O2 %(Filename).ispco %(Filename).obj -h %(Filename)_ispc.h
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||||
|
|||||||
@@ -1,14 +1,8 @@
|
|||||||
|
|
||||||
ARCH = $(shell uname)
|
ARCH = $(shell uname)
|
||||||
|
|
||||||
TASK_CXX=../tasks_pthreads.cpp
|
TASK_CXX=../tasksys.cpp
|
||||||
TASK_LIB=-lpthread
|
TASK_LIB=-lpthread
|
||||||
|
|
||||||
ifeq ($(ARCH), Darwin)
|
|
||||||
TASK_CXX=../tasks_gcd.cpp
|
|
||||||
TASK_LIB=
|
|
||||||
endif
|
|
||||||
|
|
||||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||||
|
|
||||||
CXX=g++
|
CXX=g++
|
||||||
|
|||||||
@@ -116,20 +116,38 @@ int main() {
|
|||||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Compute the image using the ispc implementation; report the minimum
|
// Compute the image using the ispc implementation on one core; report
|
||||||
// time of three runs.
|
// the minimum time of three runs.
|
||||||
//
|
//
|
||||||
double minISPC = 1e30;
|
double minTimeISPC = 1e30;
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
||||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||||
Aispc[0], Aispc[1]);
|
Aispc[0], Aispc[1]);
|
||||||
double dt = get_elapsed_mcycles();
|
double dt = get_elapsed_mcycles();
|
||||||
minISPC = std::min(minISPC, dt);
|
minTimeISPC = std::min(minTimeISPC, dt);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("[stencil ispc]:\t\t\t[%.3f] million cycles\n", minISPC);
|
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
|
||||||
|
|
||||||
|
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Compute the image using the ispc implementation with tasks; report
|
||||||
|
// the minimum time of three runs.
|
||||||
|
//
|
||||||
|
double minTimeISPCTasks = 1e30;
|
||||||
|
for (int i = 0; i < 3; ++i) {
|
||||||
|
reset_and_start_timer();
|
||||||
|
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
|
||||||
|
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||||
|
Aispc[0], Aispc[1]);
|
||||||
|
double dt = get_elapsed_mcycles();
|
||||||
|
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
|
||||||
|
|
||||||
InitData(Nx, Ny, Nz, Aserial, vsq);
|
InitData(Nx, Ny, Nz, Aserial, vsq);
|
||||||
|
|
||||||
@@ -137,19 +155,20 @@ int main() {
|
|||||||
// And run the serial implementation 3 times, again reporting the
|
// And run the serial implementation 3 times, again reporting the
|
||||||
// minimum time.
|
// minimum time.
|
||||||
//
|
//
|
||||||
double minSerial = 1e30;
|
double minTimeSerial = 1e30;
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
|
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
|
||||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||||
Aserial[0], Aserial[1]);
|
Aserial[0], Aserial[1]);
|
||||||
double dt = get_elapsed_mcycles();
|
double dt = get_elapsed_mcycles();
|
||||||
minSerial = std::min(minSerial, dt);
|
minTimeSerial = std::min(minTimeSerial, dt);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("[stencil serial]:\t\t[%.3f] millon cycles\n", minSerial);
|
printf("[stencil serial]:\t\t[%.3f] millon cycles\n", minTimeSerial);
|
||||||
|
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);
|
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n",
|
||||||
|
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||||
|
|
||||||
// Check for agreement
|
// Check for agreement
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
|
|||||||
@@ -32,7 +32,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
static task void
|
static void
|
||||||
stencil_step(uniform int x0, uniform int x1,
|
stencil_step(uniform int x0, uniform int x1,
|
||||||
uniform int y0, uniform int y1,
|
uniform int y0, uniform int y1,
|
||||||
uniform int z0, uniform int z1,
|
uniform int z0, uniform int z1,
|
||||||
@@ -67,7 +67,19 @@ stencil_step(uniform int x0, uniform int x1,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export void loop_stencil_ispc(uniform int t0, uniform int t1,
|
static task void
|
||||||
|
stencil_step_task(uniform int x0, uniform int x1,
|
||||||
|
uniform int y0, uniform int y1,
|
||||||
|
uniform int z0, uniform int z1,
|
||||||
|
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||||
|
uniform const float coef[4], uniform const float vsq[],
|
||||||
|
uniform const float Ain[], uniform float Aout[]) {
|
||||||
|
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
export void
|
||||||
|
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
|
||||||
uniform int x0, uniform int x1,
|
uniform int x0, uniform int x1,
|
||||||
uniform int y0, uniform int y1,
|
uniform int y0, uniform int y1,
|
||||||
uniform int z0, uniform int z1,
|
uniform int z0, uniform int z1,
|
||||||
@@ -83,14 +95,35 @@ export void loop_stencil_ispc(uniform int t0, uniform int t1,
|
|||||||
uniform int dz = 1;
|
uniform int dz = 1;
|
||||||
for (uniform int z = z0; z < z1; z += dz) {
|
for (uniform int z = z0; z < z1; z += dz) {
|
||||||
if ((t & 1) == 0)
|
if ((t & 1) == 0)
|
||||||
launch < stencil_step(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, coef, vsq,
|
launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz,
|
||||||
Aeven, Aodd) >;
|
coef, vsq, Aeven, Aodd) >;
|
||||||
else
|
else
|
||||||
launch < stencil_step(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, coef, vsq,
|
launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz,
|
||||||
Aodd, Aeven) >;
|
coef, vsq, Aodd, Aeven) >;
|
||||||
}
|
}
|
||||||
// We need to wait for all of the launched tasks to finish before
|
// We need to wait for all of the launched tasks to finish before
|
||||||
// starting the next iteration.
|
// starting the next iteration.
|
||||||
sync;
|
sync;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
export void
|
||||||
|
loop_stencil_ispc(uniform int t0, uniform int t1,
|
||||||
|
uniform int x0, uniform int x1,
|
||||||
|
uniform int y0, uniform int y1,
|
||||||
|
uniform int z0, uniform int z1,
|
||||||
|
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||||
|
uniform const float coef[4],
|
||||||
|
uniform const float vsq[],
|
||||||
|
uniform float Aeven[], uniform float Aodd[])
|
||||||
|
{
|
||||||
|
for (uniform int t = t0; t < t1; ++t) {
|
||||||
|
if ((t & 1) == 0)
|
||||||
|
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||||
|
Aeven, Aodd);
|
||||||
|
else
|
||||||
|
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||||
|
Aodd, Aeven);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
2
examples/stencil/stencil.vcxproj
Executable file → Normal file
2
examples/stencil/stencil.vcxproj
Executable file → Normal file
@@ -164,7 +164,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClCompile Include="stencil.cpp" />
|
<ClCompile Include="stencil.cpp" />
|
||||||
<ClCompile Include="stencil_serial.cpp" />
|
<ClCompile Include="stencil_serial.cpp" />
|
||||||
<ClCompile Include="../tasks_concrt.cpp" />
|
<ClCompile Include="../tasksys.cpp" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
|||||||
@@ -1,180 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright (c) 2011, Intel Corporation
|
|
||||||
All rights reserved.
|
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are
|
|
||||||
met:
|
|
||||||
|
|
||||||
* Redistributions of source code must retain the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer.
|
|
||||||
|
|
||||||
* Redistributions in binary form must reproduce the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer in the
|
|
||||||
documentation and/or other materials provided with the distribution.
|
|
||||||
|
|
||||||
* Neither the name of Intel Corporation nor the names of its
|
|
||||||
contributors may be used to endorse or promote products derived from
|
|
||||||
this software without specific prior written permission.
|
|
||||||
|
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
|
||||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
||||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
||||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
||||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
||||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
||||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
||||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
||||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef TASKINFO_H
|
|
||||||
#define TASKINFO_H 1
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#define ISPC_IS_WINDOWS
|
|
||||||
#elif defined(__linux__)
|
|
||||||
#define ISPC_IS_LINUX
|
|
||||||
#elif defined(__APPLE__)
|
|
||||||
#define ISPC_IS_APPLE
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef ISPC_IS_WINDOWS
|
|
||||||
#define NOMINMAX
|
|
||||||
#include <windows.h>
|
|
||||||
#include <concrt.h>
|
|
||||||
using namespace Concurrency;
|
|
||||||
#endif // ISPC_IS_WINDOWS
|
|
||||||
|
|
||||||
#if (__SIZEOF_POINTER__ == 4) || defined(__i386__) || defined(_WIN32)
|
|
||||||
#define ISPC_POINTER_BYTES 4
|
|
||||||
#elif (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64)
|
|
||||||
#define ISPC_POINTER_BYTES 8
|
|
||||||
#else
|
|
||||||
#error "Pointer size unknown!"
|
|
||||||
#endif // __SIZEOF_POINTER__
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <assert.h>
|
|
||||||
|
|
||||||
typedef struct TaskInfo {
|
|
||||||
void *func;
|
|
||||||
void *data;
|
|
||||||
#if defined(ISPC_IS_WINDOWS)
|
|
||||||
event taskEvent;
|
|
||||||
#endif
|
|
||||||
} TaskInfo;
|
|
||||||
|
|
||||||
|
|
||||||
#ifndef ISPC_IS_WINDOWS
|
|
||||||
static int32_t
|
|
||||||
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
|
|
||||||
int32_t result;
|
|
||||||
__asm__ __volatile__("lock\ncmpxchgl %2,%1"
|
|
||||||
: "=a"(result), "=m"(*v)
|
|
||||||
: "q"(newValue), "0"(oldValue)
|
|
||||||
: "memory");
|
|
||||||
__asm__ __volatile__("mfence":::"memory");
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
#endif // !ISPC_IS_WINDOWS
|
|
||||||
|
|
||||||
|
|
||||||
static void *
|
|
||||||
lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
|
|
||||||
#ifdef ISPC_IS_WINDOWS
|
|
||||||
return InterlockedCompareExchangePointer(v, newValue, oldValue);
|
|
||||||
#else
|
|
||||||
void *result;
|
|
||||||
#if (ISPC_POINTER_BYTES == 4)
|
|
||||||
__asm__ __volatile__("lock\ncmpxchgd %2,%1"
|
|
||||||
: "=a"(result), "=m"(*v)
|
|
||||||
: "q"(newValue), "0"(oldValue)
|
|
||||||
: "memory");
|
|
||||||
#else
|
|
||||||
__asm__ __volatile__("lock\ncmpxchgq %2,%1"
|
|
||||||
: "=a"(result), "=m"(*v)
|
|
||||||
: "q"(newValue), "0"(oldValue)
|
|
||||||
: "memory");
|
|
||||||
#endif // ISPC_POINTER_BYTES
|
|
||||||
__asm__ __volatile__("mfence":::"memory");
|
|
||||||
return result;
|
|
||||||
#endif // ISPC_IS_WINDOWS
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#ifndef ISPC_IS_WINDOWS
|
|
||||||
static int32_t
|
|
||||||
lAtomicAdd32(volatile int32_t *v, int32_t delta) {
|
|
||||||
// Do atomic add with gcc x86 inline assembly
|
|
||||||
int32_t origValue;
|
|
||||||
__asm__ __volatile__("lock\n"
|
|
||||||
"xaddl %0,%1"
|
|
||||||
: "=r"(origValue), "=m"(*v) : "0"(delta)
|
|
||||||
: "memory");
|
|
||||||
return origValue;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define LOG_TASK_QUEUE_CHUNK_SIZE 13
|
|
||||||
#define MAX_TASK_QUEUE_CHUNKS 1024
|
|
||||||
#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
|
|
||||||
|
|
||||||
#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)
|
|
||||||
|
|
||||||
typedef void (*TaskFuncType)(void *, int, int);
|
|
||||||
|
|
||||||
#ifdef ISPC_IS_WINDOWS
|
|
||||||
static volatile LONG nextTaskInfoCoordinate;
|
|
||||||
#else
|
|
||||||
static volatile int nextTaskInfoCoordinate;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
|
|
||||||
|
|
||||||
static inline void
|
|
||||||
lInitTaskInfo() {
|
|
||||||
taskInfo[0] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static inline TaskInfo *
|
|
||||||
lGetTaskInfo() {
|
|
||||||
#ifdef ISPC_IS_WINDOWS
|
|
||||||
int myCoord = InterlockedAdd(&nextTaskInfoCoordinate, 1)-1;
|
|
||||||
#else
|
|
||||||
int myCoord = lAtomicAdd32(&nextTaskInfoCoordinate, 1);
|
|
||||||
#endif
|
|
||||||
int index = (myCoord >> LOG_TASK_QUEUE_CHUNK_SIZE);
|
|
||||||
int offset = myCoord & (TASK_QUEUE_CHUNK_SIZE-1);
|
|
||||||
if (index == MAX_TASK_QUEUE_CHUNKS) {
|
|
||||||
fprintf(stderr, "A total of %d tasks have been launched--the simple "
|
|
||||||
"built-in task system can handle no more. Exiting.", myCoord);
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (taskInfo[index] == NULL) {
|
|
||||||
TaskInfo *newChunk = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
|
|
||||||
if (lAtomicCompareAndSwapPointer((void **)&taskInfo[index], newChunk,
|
|
||||||
NULL) != NULL) {
|
|
||||||
// failure--someone else got it, but that's cool
|
|
||||||
assert(taskInfo[index] != NULL);
|
|
||||||
free(newChunk);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return &taskInfo[index][offset];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static inline void
|
|
||||||
lResetTaskInfo() {
|
|
||||||
nextTaskInfoCoordinate = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // TASKINFO_H
|
|
||||||
@@ -1,104 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright (c) 2011, Intel Corporation
|
|
||||||
All rights reserved.
|
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are
|
|
||||||
met:
|
|
||||||
|
|
||||||
* Redistributions of source code must retain the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer.
|
|
||||||
|
|
||||||
* Redistributions in binary form must reproduce the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer in the
|
|
||||||
documentation and/or other materials provided with the distribution.
|
|
||||||
|
|
||||||
* Neither the name of Intel Corporation nor the names of its
|
|
||||||
contributors may be used to endorse or promote products derived from
|
|
||||||
this software without specific prior written permission.
|
|
||||||
|
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
|
||||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
||||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
||||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
||||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
||||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
||||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
||||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
||||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "taskinfo.h"
|
|
||||||
|
|
||||||
/* Simple task system implementation for ispc based on Microsoft's
|
|
||||||
Concurrency Runtime. */
|
|
||||||
|
|
||||||
#include <windows.h>
|
|
||||||
#include <concrt.h>
|
|
||||||
using namespace Concurrency;
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <assert.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
// ispc expects these functions to have C linkage / not be mangled
|
|
||||||
extern "C" {
|
|
||||||
void ISPCLaunch(void *f, void *data);
|
|
||||||
void ISPCSync();
|
|
||||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
|
||||||
void ISPCFree(void *ptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void __cdecl
|
|
||||||
lRunTask(LPVOID param) {
|
|
||||||
TaskInfo *ti = (TaskInfo *)param;
|
|
||||||
|
|
||||||
// Actually run the task.
|
|
||||||
// FIXME: like the GCD implementation for OS X, this is passing bogus
|
|
||||||
// values for the threadIndex and threadCount builtins, which in turn
|
|
||||||
// will cause bugs in code that uses those.
|
|
||||||
int threadIndex = 0;
|
|
||||||
int threadCount = 1;
|
|
||||||
TaskFuncType func = (TaskFuncType)ti->func;
|
|
||||||
func(ti->data, threadIndex, threadCount);
|
|
||||||
|
|
||||||
// Signal the event that this task is done
|
|
||||||
ti->taskEvent.set();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void
|
|
||||||
ISPCLaunch(void *func, void *data) {
|
|
||||||
TaskInfo *ti = lGetTaskInfo();
|
|
||||||
ti->func = (TaskFuncType)func;
|
|
||||||
ti->data = data;
|
|
||||||
ti->taskEvent.reset();
|
|
||||||
CurrentScheduler::ScheduleTask(lRunTask, ti);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void ISPCSync() {
|
|
||||||
for (int i = 0; i < nextTaskInfoCoordinate; ++i) {
|
|
||||||
int index = (i >> LOG_TASK_QUEUE_CHUNK_SIZE);
|
|
||||||
int offset = i & (TASK_QUEUE_CHUNK_SIZE-1);
|
|
||||||
taskInfo[index][offset].taskEvent.wait();
|
|
||||||
taskInfo[index][offset].taskEvent.reset();
|
|
||||||
}
|
|
||||||
|
|
||||||
lResetTaskInfo();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void *ISPCMalloc(int64_t size, int32_t alignment) {
|
|
||||||
return _aligned_malloc(size, alignment);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* Releases memory obtained from ISPCMalloc(); forwards directly to
   _aligned_free(). */
void ISPCFree(void *ptr) {
    _aligned_free(ptr);
    return;
}
|
|
||||||
@@ -1,99 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright (c) 2011, Intel Corporation
|
|
||||||
All rights reserved.
|
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are
|
|
||||||
met:
|
|
||||||
|
|
||||||
* Redistributions of source code must retain the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer.
|
|
||||||
|
|
||||||
* Redistributions in binary form must reproduce the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer in the
|
|
||||||
documentation and/or other materials provided with the distribution.
|
|
||||||
|
|
||||||
* Neither the name of Intel Corporation nor the names of its
|
|
||||||
contributors may be used to endorse or promote products derived from
|
|
||||||
this software without specific prior written permission.
|
|
||||||
|
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
|
||||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
||||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
||||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
||||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
||||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
||||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
||||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
||||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "taskinfo.h"
|
|
||||||
|
|
||||||
/* A simple task system for ispc programs based on Apple's Grand Central
|
|
||||||
Dispatch. */
|
|
||||||
#include <dispatch/dispatch.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
static int initialized = 0;
|
|
||||||
static volatile int32_t lock = 0;
|
|
||||||
static dispatch_queue_t gcdQueue;
|
|
||||||
static dispatch_group_t gcdGroup;
|
|
||||||
|
|
||||||
// ispc expects these functions to have C linkage / not be mangled
|
|
||||||
extern "C" {
|
|
||||||
void ISPCLaunch(void *f, void *data);
|
|
||||||
void ISPCSync();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static void
|
|
||||||
lRunTask(void *ti) {
|
|
||||||
TaskInfo *taskInfo = (TaskInfo *)ti;
|
|
||||||
// FIXME: these are bogus values; may cause bugs in code that depends
|
|
||||||
// on them having unique values in different threads.
|
|
||||||
int threadIndex = 0;
|
|
||||||
int threadCount = 1;
|
|
||||||
TaskFuncType func = (TaskFuncType)(taskInfo->func);
|
|
||||||
|
|
||||||
// Actually run the task
|
|
||||||
func(taskInfo->data, threadIndex, threadCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void ISPCLaunch(void *func, void *data) {
|
|
||||||
if (!initialized) {
|
|
||||||
while (1) {
|
|
||||||
if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
|
|
||||||
if (!initialized) {
|
|
||||||
gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
|
|
||||||
gcdGroup = dispatch_group_create();
|
|
||||||
lInitTaskInfo();
|
|
||||||
__asm__ __volatile__("mfence":::"memory");
|
|
||||||
initialized = 1;
|
|
||||||
}
|
|
||||||
lock = 0;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
TaskInfo *ti = lGetTaskInfo();
|
|
||||||
ti->func = func;
|
|
||||||
ti->data = data;
|
|
||||||
dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void ISPCSync() {
|
|
||||||
if (!initialized)
|
|
||||||
return;
|
|
||||||
|
|
||||||
// Wait for all of the tasks in the group to complete before returning
|
|
||||||
dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
|
|
||||||
|
|
||||||
lResetTaskInfo();
|
|
||||||
}
|
|
||||||
@@ -1,294 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright (c) 2011, Intel Corporation
|
|
||||||
All rights reserved.
|
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are
|
|
||||||
met:
|
|
||||||
|
|
||||||
* Redistributions of source code must retain the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer.
|
|
||||||
|
|
||||||
* Redistributions in binary form must reproduce the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer in the
|
|
||||||
documentation and/or other materials provided with the distribution.
|
|
||||||
|
|
||||||
* Neither the name of Intel Corporation nor the names of its
|
|
||||||
contributors may be used to endorse or promote products derived from
|
|
||||||
this software without specific prior written permission.
|
|
||||||
|
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
|
||||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
||||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
||||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
||||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
||||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
||||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
||||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
||||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "taskinfo.h"
|
|
||||||
#include <pthread.h>
|
|
||||||
#include <semaphore.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <assert.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
#include <sys/types.h>
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#include <sys/param.h>
|
|
||||||
#include <sys/sysctl.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <errno.h>
|
|
||||||
|
|
||||||
static int initialized = 0;
|
|
||||||
static volatile int32_t lock = 0;
|
|
||||||
|
|
||||||
static int nThreads;
|
|
||||||
static pthread_t *threads;
|
|
||||||
static pthread_mutex_t taskQueueMutex;
|
|
||||||
static int nextTaskToRun;
|
|
||||||
static sem_t *workerSemaphore;
|
|
||||||
static uint32_t numUnfinishedTasks;
|
|
||||||
static pthread_mutex_t tasksRunningConditionMutex;
|
|
||||||
static pthread_cond_t tasksRunningCondition;
|
|
||||||
|
|
||||||
// ispc expects these functions to have C linkage / not be mangled
|
|
||||||
extern "C" {
|
|
||||||
void ISPCLaunch(void *f, void *data);
|
|
||||||
void ISPCSync();
|
|
||||||
}
|
|
||||||
|
|
||||||
static void *lTaskEntry(void *arg);
|
|
||||||
|
|
||||||
/** Figure out how many CPU cores are currently online.

    sysconf() returns -1 when the value cannot be determined; since the
    result is used to size the worker-thread array, anything below 1 is
    reported as 1 so callers always get a usable thread count.
 */
static int
lNumCPUCores() {
    long online = sysconf(_SC_NPROCESSORS_ONLN);
    return (online < 1) ? 1 : (int)online;
}
|
|
||||||
|
|
||||||
|
|
||||||
static void
|
|
||||||
lTasksInit() {
|
|
||||||
nThreads = lNumCPUCores();
|
|
||||||
|
|
||||||
threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t));
|
|
||||||
|
|
||||||
int err;
|
|
||||||
if ((err = pthread_mutex_init(&taskQueueMutex, NULL)) != 0) {
|
|
||||||
fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
char name[32];
|
|
||||||
sprintf(name, "ispc_task.%d", (int)getpid());
|
|
||||||
workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
|
|
||||||
if (!workerSemaphore) {
|
|
||||||
fprintf(stderr, "Error creating semaphore: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((err = pthread_cond_init(&tasksRunningCondition, NULL)) != 0) {
|
|
||||||
fprintf(stderr, "Error creating condition variable: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((err = pthread_mutex_init(&tasksRunningConditionMutex, NULL)) != 0) {
|
|
||||||
fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < nThreads; ++i) {
|
|
||||||
err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)(i));
|
|
||||||
if (err != 0) {
|
|
||||||
fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void
|
|
||||||
ISPCLaunch(void *f, void *d) {
|
|
||||||
int err;
|
|
||||||
|
|
||||||
if (!initialized) {
|
|
||||||
while (1) {
|
|
||||||
if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
|
|
||||||
if (!initialized) {
|
|
||||||
lTasksInit();
|
|
||||||
__asm__ __volatile__("mfence":::"memory");
|
|
||||||
initialized = 1;
|
|
||||||
}
|
|
||||||
lock = 0;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// Acquire mutex, add task
|
|
||||||
//
|
|
||||||
if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
|
|
||||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Need a mutex here to ensure we get this filled in before a worker
|
|
||||||
// grabs it and starts running...
|
|
||||||
TaskInfo *ti = lGetTaskInfo();
|
|
||||||
ti->func = f;
|
|
||||||
ti->data = d;
|
|
||||||
|
|
||||||
if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
|
|
||||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// Update count of number of tasks left to run
|
|
||||||
//
|
|
||||||
if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
|
|
||||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// FIXME: is this redundant with nextTaskInfoCoordinate?
|
|
||||||
++numUnfinishedTasks;
|
|
||||||
|
|
||||||
if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
|
|
||||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// Post to the worker semaphore to wake up worker threads that are
|
|
||||||
// sleeping waiting for tasks to show up
|
|
||||||
//
|
|
||||||
if ((err = sem_post(workerSemaphore)) != 0) {
|
|
||||||
fprintf(stderr, "Error from sem_post: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static void *
|
|
||||||
lTaskEntry(void *arg) {
|
|
||||||
int threadIndex = (int)arg;
|
|
||||||
int threadCount = nThreads;
|
|
||||||
TaskFuncType func;
|
|
||||||
|
|
||||||
while (1) {
|
|
||||||
int err;
|
|
||||||
if ((err = sem_wait(workerSemaphore)) != 0) {
|
|
||||||
fprintf(stderr, "Error from sem_wait: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// Acquire mutex, get task
|
|
||||||
//
|
|
||||||
if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
|
|
||||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nextTaskToRun == nextTaskInfoCoordinate) {
|
|
||||||
//
|
|
||||||
// Task queue is empty, go back and wait on the semaphore
|
|
||||||
//
|
|
||||||
if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
|
|
||||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
int runCoord = nextTaskToRun++;
|
|
||||||
int index = (runCoord >> LOG_TASK_QUEUE_CHUNK_SIZE);
|
|
||||||
int offset = runCoord & (TASK_QUEUE_CHUNK_SIZE-1);
|
|
||||||
TaskInfo *myTask = &taskInfo[index][offset];
|
|
||||||
|
|
||||||
if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
|
|
||||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// Do work for _myTask_
|
|
||||||
//
|
|
||||||
func = (TaskFuncType)myTask->func;
|
|
||||||
func(myTask->data, threadIndex, threadCount);
|
|
||||||
|
|
||||||
//
|
|
||||||
// Decrement the number of unfinished tasks counter
|
|
||||||
//
|
|
||||||
if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
|
|
||||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// FIXME: can this be a comparison of (nextTaskToRun == nextTaskInfoCoordinate)?
|
|
||||||
// (I don't think so--think there is a race...)
|
|
||||||
int unfinished = --numUnfinishedTasks;
|
|
||||||
if (unfinished == 0) {
|
|
||||||
//
|
|
||||||
// Signal the "no more tasks are running" condition if all of
|
|
||||||
// them are done.
|
|
||||||
//
|
|
||||||
int err;
|
|
||||||
if ((err = pthread_cond_signal(&tasksRunningCondition)) != 0) {
|
|
||||||
fprintf(stderr, "Error from pthread_cond_signal: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
|
|
||||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pthread_exit(NULL);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void ISPCSync() {
|
|
||||||
int err;
|
|
||||||
if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
|
|
||||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// As long as there are tasks running, wait on the condition variable;
|
|
||||||
// doing so causes this thread to go to sleep until someone signals on
|
|
||||||
// the tasksRunningCondition condition variable.
|
|
||||||
while (numUnfinishedTasks > 0) {
|
|
||||||
if ((err = pthread_cond_wait(&tasksRunningCondition,
|
|
||||||
&tasksRunningConditionMutex)) != 0) {
|
|
||||||
fprintf(stderr, "Error from pthread_cond_wait: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
lResetTaskInfo();
|
|
||||||
nextTaskToRun = 0;
|
|
||||||
|
|
||||||
// We acquire ownership of the condition variable mutex when the above
|
|
||||||
// pthread_cond_wait returns.
|
|
||||||
// FIXME: is there a lurking issue here if numUnfinishedTasks gets back
|
|
||||||
// to zero by the time we get to ISPCSync() and thence we're trying to
|
|
||||||
// unlock a mutex we don't have a lock on?
|
|
||||||
if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
|
|
||||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
868
examples/tasksys.cpp
Normal file
868
examples/tasksys.cpp
Normal file
@@ -0,0 +1,868 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
This file implements simple task systems that provide the three
|
||||||
|
entrypoints used by ispc-generated code to handle 'launch' and 'sync'
|
||||||
|
statements in ispc programs. See the section "Task Parallelism: Language
|
||||||
|
Syntax" in the ispc documentation for information about using task
|
||||||
|
parallelism in ispc programs, and see the section "Task Parallelism:
|
||||||
|
Runtime Requirements" for information about the task-related entrypoints
|
||||||
|
that are implemented here.
|
||||||
|
|
||||||
|
There are three task systems in this file: one built using Microsoft's
|
||||||
|
Concurrency Runtime, one built with Apple's Grand Central Dispatch, and
|
||||||
|
one built on top of bare pthreads.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if defined(_WIN32) || defined(_WIN64)
|
||||||
|
#define ISPC_IS_WINDOWS
|
||||||
|
#define ISPC_USE_CONCRT
|
||||||
|
#elif defined(__linux__)
|
||||||
|
#define ISPC_IS_LINUX
|
||||||
|
#define ISPC_USE_PTHREADS
|
||||||
|
#elif defined(__APPLE__)
|
||||||
|
#define ISPC_IS_APPLE
|
||||||
|
// pthreads is noticeably more efficient than GCD on OSX
|
||||||
|
#define ISPC_USE_PTHREADS
|
||||||
|
//#define ISPC_USE_GCD
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define DBG(x)
|
||||||
|
|
||||||
|
#ifdef ISPC_IS_WINDOWS
|
||||||
|
#define NOMINMAX
|
||||||
|
#include <windows.h>
|
||||||
|
#endif // ISPC_IS_WINDOWS
|
||||||
|
#ifdef ISPC_USE_CONCRT
|
||||||
|
#include <concrt.h>
|
||||||
|
using namespace Concurrency;
|
||||||
|
#endif // ISPC_USE_CONCRT
|
||||||
|
#ifdef ISPC_USE_GCD
|
||||||
|
#include <dispatch/dispatch.h>
|
||||||
|
#include <pthread.h>
|
||||||
|
#endif // ISPC_USE_GCD
|
||||||
|
#ifdef ISPC_USE_PTHREADS
|
||||||
|
#include <pthread.h>
|
||||||
|
#include <semaphore.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/param.h>
|
||||||
|
#include <sys/sysctl.h>
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm>
|
||||||
|
#endif // ISPC_USE_PTHREADS
|
||||||
|
#ifdef ISPC_IS_LINUX
|
||||||
|
#include <malloc.h>
|
||||||
|
#endif // ISPC_IS_LINUX
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
// Signature of ispc-generated 'task' functions
|
||||||
|
typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,
|
||||||
|
int taskIndex, int taskCount);
|
||||||
|
|
||||||
|
// Small structure used to hold the data for each task
|
||||||
|
struct TaskInfo {
|
||||||
|
TaskFuncType func;
|
||||||
|
void *data;
|
||||||
|
int taskIndex, taskCount;
|
||||||
|
#if defined(ISPC_IS_WINDOWS)
|
||||||
|
event taskEvent;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// TaskGroupBase
|
||||||
|
|
||||||
|
#define LOG_TASK_QUEUE_CHUNK_SIZE 12
|
||||||
|
#define MAX_TASK_QUEUE_CHUNKS 8
|
||||||
|
#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
|
||||||
|
|
||||||
|
#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)
|
||||||
|
|
||||||
|
#define NUM_MEM_BUFFERS 16
|
||||||
|
|
||||||
|
class TaskGroup;
|
||||||
|
|
||||||
|
/** The TaskGroupBase structure provides common functionality for "task
|
||||||
|
groups"; a task group is the set of tasks launched from within a single
|
||||||
|
ispc function. When the function is ready to return, it waits for all
|
||||||
|
of the tasks in its task group to finish before it actually returns.
|
||||||
|
*/
|
||||||
|
class TaskGroupBase {
|
||||||
|
public:
|
||||||
|
void Reset();
|
||||||
|
|
||||||
|
int AllocTaskInfo(int count);
|
||||||
|
TaskInfo *GetTaskInfo(int index);
|
||||||
|
|
||||||
|
void *AllocMemory(int64_t size, int32_t alignment);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
TaskGroupBase();
|
||||||
|
~TaskGroupBase();
|
||||||
|
|
||||||
|
int nextTaskInfoIndex;
|
||||||
|
|
||||||
|
private:
|
||||||
|
/* We allocate blocks of TASK_QUEUE_CHUNK_SIZE TaskInfo structures as
|
||||||
|
needed by the calling function. We hold up to MAX_TASK_QUEUE_CHUNKS
|
||||||
|
of these (and then exit at runtime if more than this many tasks are
|
||||||
|
launched.)
|
||||||
|
*/
|
||||||
|
TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
|
||||||
|
|
||||||
|
/* We also allocate chunks of memory to service ISPCAlloc() calls. The
|
||||||
|
memBuffers[] array holds pointers to this memory. The first element
|
||||||
|
of this array is initialized to point to mem and then any subsequent
|
||||||
|
elements required are initialized with dynamic allocation.
|
||||||
|
*/
|
||||||
|
int curMemBuffer, curMemBufferOffset;
|
||||||
|
int memBufferSize[NUM_MEM_BUFFERS];
|
||||||
|
char *memBuffers[NUM_MEM_BUFFERS];
|
||||||
|
char mem[256];
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/* Starts with no TaskInfo chunks allocated and with the in-object array
   mem installed as the first (and only) memory buffer. */
inline TaskGroupBase::TaskGroupBase()
    : nextTaskInfoIndex(0), curMemBuffer(0), curMemBufferOffset(0) {
    // TaskInfo chunks are created lazily.
    for (int chunk = 0; chunk < MAX_TASK_QUEUE_CHUNKS; ++chunk)
        taskInfo[chunk] = NULL;

    // Buffer 0 is the embedded array; the remaining slots stay empty until
    // an allocation needs them.
    memBuffers[0] = mem;
    memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
    for (int buf = 1; buf < NUM_MEM_BUFFERS; ++buf) {
        memBufferSize[buf] = 0;
        memBuffers[buf] = NULL;
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Releases everything the group allocated dynamically: the overflow
   memory buffers and the TaskInfo chunks created by GetTaskInfo(). */
inline TaskGroupBase::~TaskGroupBase() {
    // Note: don't delete memBuffers[0], since it points to the start of
    // the "mem" member!
    for (int i = 1; i < NUM_MEM_BUFFERS; ++i)
        delete[] memBuffers[i];

    // TaskInfo chunks are allocated with new[] on first use; unused slots
    // are NULL, which delete[] accepts.
    for (int i = 0; i < MAX_TASK_QUEUE_CHUNKS; ++i)
        delete[] taskInfo[i];
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void
|
||||||
|
TaskGroupBase::Reset() {
|
||||||
|
nextTaskInfoIndex = 0;
|
||||||
|
curMemBuffer = 0;
|
||||||
|
curMemBufferOffset = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline int
|
||||||
|
TaskGroupBase::AllocTaskInfo(int count) {
|
||||||
|
int ret = nextTaskInfoIndex;
|
||||||
|
nextTaskInfoIndex += count;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Returns the TaskInfo stored at the given index, allocating the chunk
   that contains it on first use.

   The index is split into a chunk number (high bits) and a position
   within the chunk (low LOG_TASK_QUEUE_CHUNK_SIZE bits).  If the index
   falls outside the MAX_TASK_QUEUE_CHUNKS chunks that can be held, an
   error is printed and the process exits. */
inline TaskInfo *
TaskGroupBase::GetTaskInfo(int index) {
    int chunk = (index >> LOG_TASK_QUEUE_CHUNK_SIZE);
    int offset = index & (TASK_QUEUE_CHUNK_SIZE-1);

    // Reject every out-of-range chunk, not just the first one past the
    // end: indices are handed out in batches, so an index can land beyond
    // chunk MAX_TASK_QUEUE_CHUNKS, and a negative index would otherwise
    // read before the start of the array.
    if (chunk < 0 || chunk >= MAX_TASK_QUEUE_CHUNKS) {
        fprintf(stderr, "A total of %d tasks have been launched from the "
                "current function--the simple built-in task system can handle "
                "no more. You can increase the values of MAX_TASK_QUEUE_CHUNKS "
                "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation.  "
                "Sorry!  Exiting.\n", index);
        exit(1);
    }

    if (taskInfo[chunk] == NULL)
        taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
    return &taskInfo[chunk][offset];
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void *
|
||||||
|
TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
|
||||||
|
char *basePtr = memBuffers[curMemBuffer];
|
||||||
|
int64_t iptr = (int64_t)(basePtr + curMemBufferOffset);
|
||||||
|
iptr = (iptr + (alignment-1)) & ~(alignment-1);
|
||||||
|
|
||||||
|
int newOffset = int(iptr + size - (int64_t)basePtr);
|
||||||
|
if (newOffset < memBufferSize[curMemBuffer]) {
|
||||||
|
curMemBufferOffset = newOffset;
|
||||||
|
return (char *)iptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
++curMemBuffer;
|
||||||
|
curMemBufferOffset = 0;
|
||||||
|
assert(curMemBuffer < NUM_MEM_BUFFERS);
|
||||||
|
|
||||||
|
int allocSize = 1 << (12 + curMemBuffer);
|
||||||
|
allocSize = std::max(int(size+alignment), allocSize);
|
||||||
|
char *newBuf = new char[allocSize];
|
||||||
|
memBufferSize[curMemBuffer] = allocSize;
|
||||||
|
memBuffers[curMemBuffer] = newBuf;
|
||||||
|
return AllocMemory(size, alignment);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// Atomics and the like
|
||||||
|
|
||||||
|
#ifndef ISPC_IS_WINDOWS
|
||||||
|
/* Issues an x86 mfence instruction; the "memory" clobber additionally
   stops the compiler from moving memory accesses across the call. */
static inline void
lMemFence() {
    __asm__ __volatile__("mfence"
                         : /* no outputs */
                         : /* no inputs */
                         : "memory");
}
|
||||||
|
#endif // !ISPC_IS_WINDOWS
|
||||||
|
|
||||||
|
|
||||||
|
#if (__SIZEOF_POINTER__ == 4) || defined(__i386__) || defined(_WIN32)
|
||||||
|
#define ISPC_POINTER_BYTES 4
|
||||||
|
#elif (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64)
|
||||||
|
#define ISPC_POINTER_BYTES 8
|
||||||
|
#else
|
||||||
|
#error "Pointer size unknown!"
|
||||||
|
#endif // __SIZEOF_POINTER__
|
||||||
|
|
||||||
|
|
||||||
|
static void *
|
||||||
|
lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
|
||||||
|
#ifdef ISPC_IS_WINDOWS
|
||||||
|
return InterlockedCompareExchangePointer(v, newValue, oldValue);
|
||||||
|
#else
|
||||||
|
void *result;
|
||||||
|
#if (ISPC_POINTER_BYTES == 4)
|
||||||
|
__asm__ __volatile__("lock\ncmpxchgd %2,%1"
|
||||||
|
: "=a"(result), "=m"(*v)
|
||||||
|
: "q"(newValue), "0"(oldValue)
|
||||||
|
: "memory");
|
||||||
|
#else
|
||||||
|
__asm__ __volatile__("lock\ncmpxchgq %2,%1"
|
||||||
|
: "=a"(result), "=m"(*v)
|
||||||
|
: "q"(newValue), "0"(oldValue)
|
||||||
|
: "memory");
|
||||||
|
#endif // ISPC_POINTER_BYTES
|
||||||
|
lMemFence();
|
||||||
|
return result;
|
||||||
|
#endif // ISPC_IS_WINDOWS
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef ISPC_IS_WINDOWS
|
||||||
|
static int32_t
|
||||||
|
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
|
||||||
|
int32_t result;
|
||||||
|
__asm__ __volatile__("lock\ncmpxchgl %2,%1"
|
||||||
|
: "=a"(result), "=m"(*v)
|
||||||
|
: "q"(newValue), "0"(oldValue)
|
||||||
|
: "memory");
|
||||||
|
lMemFence();
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
#endif // !ISPC_IS_WINDOWS
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
#ifdef ISPC_USE_CONCRT
|
||||||
|
// With ConcRT, we don't need to extend TaskGroupBase at all.
|
||||||
|
class TaskGroup : public TaskGroupBase {
|
||||||
|
public:
|
||||||
|
void Launch(int baseIndex, int count);
|
||||||
|
void Sync();
|
||||||
|
};
|
||||||
|
#endif // ISPC_USE_CONCRT
|
||||||
|
|
||||||
|
#ifdef ISPC_USE_GCD
|
||||||
|
/* With Grand Central Dispatch, we associate a GCD dispatch group with each
|
||||||
|
task group. (We'll later wait on this dispatch group when we need to
|
||||||
|
wait on all of the tasks in the group to finish.)
|
||||||
|
*/
|
||||||
|
class TaskGroup : public TaskGroupBase {
|
||||||
|
public:
|
||||||
|
TaskGroup() {
|
||||||
|
gcdGroup = dispatch_group_create();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Launch(int baseIndex, int count);
|
||||||
|
void Sync();
|
||||||
|
|
||||||
|
private:
|
||||||
|
dispatch_group_t gcdGroup;
|
||||||
|
};
|
||||||
|
#endif // ISPC_USE_GCD
|
||||||
|
|
||||||
|
#ifdef ISPC_USE_PTHREADS
|
||||||
|
static void *lTaskEntry(void *arg);
|
||||||
|
|
||||||
|
class TaskGroup : public TaskGroupBase {
|
||||||
|
public:
|
||||||
|
TaskGroup() {
|
||||||
|
numUnfinishedTasks = 0;
|
||||||
|
waitingTasks.reserve(128);
|
||||||
|
inActiveList = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void Reset() {
|
||||||
|
TaskGroupBase::Reset();
|
||||||
|
numUnfinishedTasks = 0;
|
||||||
|
assert(inActiveList == false);
|
||||||
|
lMemFence();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Launch(int baseIndex, int count);
|
||||||
|
void Sync();
|
||||||
|
|
||||||
|
private:
|
||||||
|
friend void *lTaskEntry(void *arg);
|
||||||
|
|
||||||
|
int32_t numUnfinishedTasks;
|
||||||
|
int32_t pad[3];
|
||||||
|
std::vector<int> waitingTasks;
|
||||||
|
bool inActiveList;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // ISPC_USE_PTHREADS
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// Grand Central Dispatch
|
||||||
|
|
||||||
|
#ifdef ISPC_USE_GCD
|
||||||
|
|
||||||
|
/* A simple task system for ispc programs based on Apple's Grand Central
|
||||||
|
Dispatch. */
|
||||||
|
|
||||||
|
static dispatch_queue_t gcdQueue;
|
||||||
|
static volatile int32_t lock = 0;
|
||||||
|
|
||||||
|
static void
|
||||||
|
InitTaskSystem() {
|
||||||
|
if (gcdQueue != NULL)
|
||||||
|
return;
|
||||||
|
|
||||||
|
while (1) {
|
||||||
|
if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
|
||||||
|
if (gcdQueue == NULL) {
|
||||||
|
gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
|
||||||
|
assert(gcdQueue != NULL);
|
||||||
|
lMemFence();
|
||||||
|
}
|
||||||
|
lock = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
lRunTask(void *ti) {
|
||||||
|
TaskInfo *taskInfo = (TaskInfo *)ti;
|
||||||
|
// FIXME: these are bogus values; may cause bugs in code that depends
|
||||||
|
// on them having unique values in different threads.
|
||||||
|
int threadIndex = 0;
|
||||||
|
int threadCount = 1;
|
||||||
|
|
||||||
|
// Actually run the task
|
||||||
|
taskInfo->func(taskInfo->data, threadIndex, threadCount,
|
||||||
|
taskInfo->taskIndex, taskInfo->taskCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void
|
||||||
|
TaskGroup::Launch(int baseIndex, int count) {
|
||||||
|
for (int i = 0; i < count; ++i) {
|
||||||
|
TaskInfo *ti = GetTaskInfo(baseIndex + i);
|
||||||
|
dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void
|
||||||
|
TaskGroup::Sync() {
|
||||||
|
dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // ISPC_USE_GCD
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// Concurrency Runtime
|
||||||
|
|
||||||
|
#ifdef ISPC_USE_CONCRT
|
||||||
|
|
||||||
|
/** Task system setup for the Concurrency Runtime implementation. */
static void
InitTaskSystem() {
    // Intentionally empty: this implementation has nothing to set up.
}
|
||||||
|
|
||||||
|
|
||||||
|
static void __cdecl
|
||||||
|
lRunTask(LPVOID param) {
|
||||||
|
TaskInfo *ti = (TaskInfo *)param;
|
||||||
|
|
||||||
|
// Actually run the task.
|
||||||
|
// FIXME: like the GCD implementation for OS X, this is passing bogus
|
||||||
|
// values for the threadIndex and threadCount builtins, which in turn
|
||||||
|
// will cause bugs in code that uses those.
|
||||||
|
int threadIndex = 0;
|
||||||
|
int threadCount = 1;
|
||||||
|
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
|
||||||
|
|
||||||
|
// Signal the event that this task is done
|
||||||
|
ti->taskEvent.set();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void
|
||||||
|
TaskGroup::Launch(int baseIndex, int count) {
|
||||||
|
for (int i = 0; i < count; ++i)
|
||||||
|
CurrentScheduler::ScheduleTask(lRunTask, GetTaskInfo(baseIndex + i));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void
|
||||||
|
TaskGroup::Sync() {
|
||||||
|
for (int i = 0; i < nextTaskInfoIndex; ++i) {
|
||||||
|
TaskInfo *ti = GetTaskInfo(i);
|
||||||
|
ti->taskEvent.wait();
|
||||||
|
ti->taskEvent.reset();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // ISPC_USE_CONCRT
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// pthreads
|
||||||
|
|
||||||
|
#ifdef ISPC_USE_PTHREADS
|
||||||
|
|
||||||
|
static volatile int32_t lock = 0;
|
||||||
|
|
||||||
|
static int nThreads;
|
||||||
|
static pthread_t *threads = NULL;
|
||||||
|
|
||||||
|
static pthread_mutex_t taskSysMutex;
|
||||||
|
static std::vector<TaskGroup *> activeTaskGroups;
|
||||||
|
static sem_t *workerSemaphore;
|
||||||
|
|
||||||
|
|
||||||
|
/** Atomically adds delta to *v.

    This uses the compiler's __sync_fetch_and_add builtin (a full memory
    barrier) rather than hand-written "lock xaddl" inline assembly.  The
    assembly version declared the memory operand as output-only even
    though the instruction both reads and writes it, and it only worked
    on x86.

    @param v      Address of the 32-bit counter to update.
    @param delta  Amount to add; may be negative.
    @return       The value *v held immediately before the addition.
 */
static inline int32_t
lAtomicAdd(int32_t *v, int32_t delta) {
    return __sync_fetch_and_add(v, delta);
}
|
||||||
|
|
||||||
|
|
||||||
|
static void *
|
||||||
|
lTaskEntry(void *arg) {
|
||||||
|
int threadIndex = (int)((int64_t)arg);
|
||||||
|
int threadCount = nThreads;
|
||||||
|
|
||||||
|
while (1) {
|
||||||
|
int err;
|
||||||
|
//
|
||||||
|
// Wait on the semaphore until we're woken up due to the arrival of
|
||||||
|
// more work.
|
||||||
|
//
|
||||||
|
if ((err = sem_wait(workerSemaphore)) != 0) {
|
||||||
|
fprintf(stderr, "Error from sem_wait: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Acquire the mutex
|
||||||
|
//
|
||||||
|
if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
|
||||||
|
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (activeTaskGroups.size() == 0) {
|
||||||
|
//
|
||||||
|
// Task queue is empty, go back and wait on the semaphore
|
||||||
|
//
|
||||||
|
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||||
|
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Get the last task group on the active list and the last task
|
||||||
|
// from its waiting tasks list.
|
||||||
|
//
|
||||||
|
TaskGroup *tg = activeTaskGroups.back();
|
||||||
|
assert(tg->waitingTasks.size() > 0);
|
||||||
|
int taskNumber = tg->waitingTasks.back();
|
||||||
|
tg->waitingTasks.pop_back();
|
||||||
|
|
||||||
|
if (tg->waitingTasks.size() == 0) {
|
||||||
|
// We just took the last task from this task group, so remove
|
||||||
|
// it from the active list.
|
||||||
|
activeTaskGroups.pop_back();
|
||||||
|
tg->inActiveList = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||||
|
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// And now actually run the task
|
||||||
|
//
|
||||||
|
DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg));
|
||||||
|
TaskInfo *myTask = tg->GetTaskInfo(taskNumber);
|
||||||
|
myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex,
|
||||||
|
myTask->taskCount);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Decrement the "number of unfinished tasks" counter in the task
|
||||||
|
// group.
|
||||||
|
//
|
||||||
|
lMemFence();
|
||||||
|
lAtomicAdd(&tg->numUnfinishedTasks, -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
pthread_exit(NULL);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
InitTaskSystem() {
|
||||||
|
if (threads == NULL) {
|
||||||
|
while (1) {
|
||||||
|
if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
|
||||||
|
if (threads == NULL) {
|
||||||
|
// We launch one fewer thread than there are cores,
|
||||||
|
// since the main thread here will also grab jobs from
|
||||||
|
// the task queue itself.
|
||||||
|
nThreads = sysconf(_SC_NPROCESSORS_ONLN) - 1;
|
||||||
|
|
||||||
|
int err;
|
||||||
|
if ((err = pthread_mutex_init(&taskSysMutex, NULL)) != 0) {
|
||||||
|
fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
char name[32];
|
||||||
|
sprintf(name, "ispc_task.%d", (int)getpid());
|
||||||
|
workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
|
||||||
|
if (!workerSemaphore) {
|
||||||
|
fprintf(stderr, "Error creating semaphore: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t));
|
||||||
|
for (int i = 0; i < nThreads; ++i) {
|
||||||
|
err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)(i));
|
||||||
|
if (err != 0) {
|
||||||
|
fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
activeTaskGroups.reserve(64);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make sure all of the above goes to memory before we
|
||||||
|
// clear the lock.
|
||||||
|
lMemFence();
|
||||||
|
lock = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void
|
||||||
|
TaskGroup::Launch(int baseCoord, int count) {
|
||||||
|
//
|
||||||
|
// Acquire mutex, add task
|
||||||
|
//
|
||||||
|
int err;
|
||||||
|
if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
|
||||||
|
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add the corresponding set of tasks to the waiting-to-be-run list for
|
||||||
|
// this task group.
|
||||||
|
//
|
||||||
|
// FIXME: it's a little ugly to hold a global mutex for this when we
|
||||||
|
// only need to make sure no one else is accessing this task group's
|
||||||
|
// waitingTasks list. (But a small experiment in switching to a
|
||||||
|
// per-TaskGroup mutex showed worse performance!)
|
||||||
|
for (int i = 0; i < count; ++i)
|
||||||
|
waitingTasks.push_back(baseCoord + i);
|
||||||
|
|
||||||
|
// Add the task group to the global active list if it isn't there
|
||||||
|
// already.
|
||||||
|
if (inActiveList == false) {
|
||||||
|
activeTaskGroups.push_back(this);
|
||||||
|
inActiveList = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||||
|
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Update the count of the number of tasks left to run in this task
|
||||||
|
// group.
|
||||||
|
//
|
||||||
|
lMemFence();
|
||||||
|
lAtomicAdd(&numUnfinishedTasks, count);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Post to the worker semaphore to wake up worker threads that are
|
||||||
|
// sleeping waiting for tasks to show up
|
||||||
|
//
|
||||||
|
for (int i = 0; i < count; ++i)
|
||||||
|
if ((err = sem_post(workerSemaphore)) != 0) {
|
||||||
|
fprintf(stderr, "Error from sem_post: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void
|
||||||
|
TaskGroup::Sync() {
|
||||||
|
DBG(fprintf(stderr, "syncing %p - %d unfinished\n", tg, numUnfinishedTasks));
|
||||||
|
|
||||||
|
while (numUnfinishedTasks > 0) {
|
||||||
|
// All of the tasks in this group aren't finished yet. We'll try
|
||||||
|
// to help out here since we don't have anything else to do...
|
||||||
|
|
||||||
|
DBG(fprintf(stderr, "while syncing %p - %d unfinished\n", tg,
|
||||||
|
numUnfinishedTasks));
|
||||||
|
|
||||||
|
//
|
||||||
|
// Acquire the global task system mutex to grab a task to work on
|
||||||
|
//
|
||||||
|
int err;
|
||||||
|
if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
|
||||||
|
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
TaskInfo *myTask = NULL;
|
||||||
|
TaskGroup *runtg = this;
|
||||||
|
if (waitingTasks.size() > 0) {
|
||||||
|
int taskNumber = waitingTasks.back();
|
||||||
|
waitingTasks.pop_back();
|
||||||
|
|
||||||
|
if (waitingTasks.size() == 0) {
|
||||||
|
// There's nothing left to start running from this group,
|
||||||
|
// so remove it from the active task list.
|
||||||
|
activeTaskGroups.erase(std::find(activeTaskGroups.begin(),
|
||||||
|
activeTaskGroups.end(), this));
|
||||||
|
inActiveList = false;
|
||||||
|
}
|
||||||
|
myTask = GetTaskInfo(taskNumber);
|
||||||
|
DBG(fprintf(stderr, "running task %d from group %p in sync\n", taskNumber, tg));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// Other threads are already working on all of the tasks in
|
||||||
|
// this group, so we can't help out by running one ourself.
|
||||||
|
// We'll try to run one from another group to make ourselves
|
||||||
|
// useful here.
|
||||||
|
if (activeTaskGroups.size() == 0) {
|
||||||
|
// No active task groups left--there's nothing for us to do.
|
||||||
|
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||||
|
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
// FIXME: We basically end up busy-waiting here, which is
|
||||||
|
// extra wasteful in a world with hyperthreading. It would
|
||||||
|
// be much better to put this thread to sleep on a
|
||||||
|
// condition variable that was signaled when the last task
|
||||||
|
// in this group was finished.
|
||||||
|
sleep(0);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get a task to run from another task group.
|
||||||
|
runtg = activeTaskGroups.back();
|
||||||
|
assert(runtg->waitingTasks.size() > 0);
|
||||||
|
|
||||||
|
int taskNumber = runtg->waitingTasks.back();
|
||||||
|
runtg->waitingTasks.pop_back();
|
||||||
|
if (runtg->waitingTasks.size() == 0) {
|
||||||
|
// There's left to start running from this group, so remove
|
||||||
|
// it from the active task list.
|
||||||
|
activeTaskGroups.pop_back();
|
||||||
|
runtg->inActiveList = false;
|
||||||
|
}
|
||||||
|
myTask = runtg->GetTaskInfo(taskNumber);
|
||||||
|
DBG(fprintf(stderr, "running task %d from other group %p in sync\n",
|
||||||
|
taskNumber, runtg));
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||||
|
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Do work for _myTask_
|
||||||
|
//
|
||||||
|
// FIXME: bogus values for thread index/thread count here as well..
|
||||||
|
myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Decrement the number of unfinished tasks counter
|
||||||
|
//
|
||||||
|
lMemFence();
|
||||||
|
lAtomicAdd(&runtg->numUnfinishedTasks, -1);
|
||||||
|
}
|
||||||
|
DBG(fprintf(stderr, "sync for %p done!n", tg));
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // ISPC_USE_PTHREADS
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
#define MAX_FREE_TASK_GROUPS 64
|
||||||
|
static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
|
||||||
|
|
||||||
|
/** Returns a TaskGroup for the caller's exclusive use, preferring one
    from the freeTaskGroups cache and allocating a new one only if no
    cached group could be claimed.

    A cached group is claimed by atomically replacing its slot with NULL.
    The claim only counts as successful if the compare-and-swap returns
    the very pointer we expected to find.  If another thread changed the
    slot in between, the call returns whatever is there now (NULL or a
    different group) and nothing was swapped, so that value must not be
    handed out; we move on to the next slot instead.

    NOTE(review): this relies on lAtomicCompareAndSwapPointer(addr,
    newValue, oldValue) returning the previous contents of *addr, which is
    how FreeTaskGroup() interprets its result -- confirm against the
    helper's definition.

    @return  A task group owned by the caller; never NULL.
 */
static inline TaskGroup *
AllocTaskGroup() {
    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
        TaskGroup *tg = freeTaskGroups[i];
        if (tg == NULL)
            continue;

        void *prev = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);
        if (prev == tg)
            // The slot still held tg and is now NULL: tg is ours.
            return tg;
    }

    return new TaskGroup;
}
|
||||||
|
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
FreeTaskGroup(TaskGroup *tg) {
|
||||||
|
tg->Reset();
|
||||||
|
|
||||||
|
for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
|
||||||
|
if (freeTaskGroups[i] == NULL) {
|
||||||
|
void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL);
|
||||||
|
if (ptr == NULL)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
delete tg;
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// ispc expects these functions to have C linkage / not be mangled
|
||||||
|
extern "C" {
|
||||||
|
void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
|
||||||
|
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
|
||||||
|
void ISPCSync(void *handle);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
|
||||||
|
TaskGroup *taskGroup;
|
||||||
|
if (*taskGroupPtr == NULL) {
|
||||||
|
InitTaskSystem();
|
||||||
|
taskGroup = AllocTaskGroup();
|
||||||
|
*taskGroupPtr = taskGroup;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
taskGroup = (TaskGroup *)(*taskGroupPtr);
|
||||||
|
|
||||||
|
int baseIndex = taskGroup->AllocTaskInfo(count);
|
||||||
|
for (int i = 0; i < count; ++i) {
|
||||||
|
TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i);
|
||||||
|
ti->func = (TaskFuncType)func;
|
||||||
|
ti->data = data;
|
||||||
|
ti->taskIndex = i;
|
||||||
|
ti->taskCount = count;
|
||||||
|
}
|
||||||
|
taskGroup->Launch(baseIndex, count);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
ISPCSync(void *h) {
|
||||||
|
TaskGroup *taskGroup = (TaskGroup *)h;
|
||||||
|
if (taskGroup != NULL) {
|
||||||
|
taskGroup->Sync();
|
||||||
|
FreeTaskGroup(taskGroup);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void *
|
||||||
|
ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) {
|
||||||
|
TaskGroup *taskGroup;
|
||||||
|
if (*taskGroupPtr == NULL) {
|
||||||
|
InitTaskSystem();
|
||||||
|
taskGroup = AllocTaskGroup();
|
||||||
|
*taskGroupPtr = taskGroup;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
taskGroup = (TaskGroup *)(*taskGroupPtr);
|
||||||
|
|
||||||
|
return taskGroup->AllocMemory(size, alignment);
|
||||||
|
}
|
||||||
@@ -38,7 +38,9 @@
|
|||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#define rdtsc __rdtsc
|
#define rdtsc __rdtsc
|
||||||
#else
|
#else
|
||||||
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
#endif /* __cplusplus */
|
||||||
__inline__ uint64_t rdtsc() {
|
__inline__ uint64_t rdtsc() {
|
||||||
uint32_t low, high;
|
uint32_t low, high;
|
||||||
__asm__ __volatile__ (
|
__asm__ __volatile__ (
|
||||||
@@ -48,7 +50,9 @@ extern "C" {
|
|||||||
"rdtsc" : "=a" (low), "=d" (high));
|
"rdtsc" : "=a" (low), "=d" (high));
|
||||||
return (uint64_t)high << 32 | low;
|
return (uint64_t)high << 32 | low;
|
||||||
}
|
}
|
||||||
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
#endif /* __cplusplus */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static uint64_t start, end;
|
static uint64_t start, end;
|
||||||
|
|||||||
@@ -1,14 +1,8 @@
|
|||||||
|
|
||||||
ARCH = $(shell uname)
|
ARCH = $(shell uname)
|
||||||
|
|
||||||
TASK_CXX=../tasks_pthreads.cpp
|
TASK_CXX=../tasksys.cpp
|
||||||
TASK_LIB=-lpthread
|
TASK_LIB=-lpthread
|
||||||
|
|
||||||
ifeq ($(ARCH), Darwin)
|
|
||||||
TASK_CXX=../tasks_gcd.cpp
|
|
||||||
TASK_LIB=
|
|
||||||
endif
|
|
||||||
|
|
||||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||||
|
|
||||||
CXX=g++
|
CXX=g++
|
||||||
|
|||||||
@@ -343,11 +343,20 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
|||||||
|
|
||||||
|
|
||||||
// Task that renders one (dx,dy)-sized tile of the image.  The tile is
// derived from taskIndex: tiles are numbered left to right, then top to
// bottom, with xbuckets tiles per row.
task void
volume_task(uniform float density[], uniform int nVoxels[3],
            const uniform float raster2camera[4][4],
            const uniform float camera2world[4][4],
            uniform int width, uniform int height, uniform float image[]) {
    uniform int dx = 8, dy = 8;  // must match value in volume_ispc_tasks
    // Number of tiles needed to cover one row of the image.
    uniform int xbuckets = (width + (dx-1)) / dx;

    // Column and row of this task's tile.  The row is the task index
    // divided by the number of tiles per row (xbuckets); dividing by the
    // number of tile rows instead is only right for square tile grids.
    uniform int x0 = (taskIndex % xbuckets) * dx;
    uniform int y0 = (taskIndex / xbuckets) * dy;
    // Clamp the tile to the image bounds.
    uniform int x1 = x0 + dx, y1 = y0 + dy;
    x1 = min(x1, width);
    y1 = min(y1, height);

    volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
                camera2world, width, height, image);
}
|
||||||
@@ -370,9 +379,7 @@ volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
|
|||||||
uniform int width, uniform int height, uniform float image[]) {
|
uniform int width, uniform int height, uniform float image[]) {
|
||||||
// Launch tasks to work on (dx,dy)-sized tiles of the image
|
// Launch tasks to work on (dx,dy)-sized tiles of the image
|
||||||
uniform int dx = 8, dy = 8;
|
uniform int dx = 8, dy = 8;
|
||||||
for (uniform int y = 0; y < height; y += dy)
|
uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
|
||||||
for (uniform int x = 0; x < width; x += dx)
|
launch[nTasks] < volume_task(density, nVoxels, raster2camera, camera2world,
|
||||||
launch < volume_task(x, y, x+dx, y+dy, density, nVoxels,
|
width, height, image) >;
|
||||||
raster2camera, camera2world, width, height,
|
|
||||||
image) >;
|
|
||||||
}
|
}
|
||||||
|
|||||||
2
examples/volume_rendering/volume.vcxproj
Executable file → Normal file
2
examples/volume_rendering/volume.vcxproj
Executable file → Normal file
@@ -143,7 +143,7 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClCompile Include="volume.cpp" />
|
<ClCompile Include="volume.cpp" />
|
||||||
<ClCompile Include="volume_serial.cpp" />
|
<ClCompile Include="volume_serial.cpp" />
|
||||||
<ClCompile Include="../tasks_concrt.cpp" />
|
<ClCompile Include="../tasksys.cpp" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="volume.ispc">
|
<CustomBuild Include="volume.ispc">
|
||||||
|
|||||||
@@ -104,7 +104,7 @@ Inside(float3 p, float3 pMin, float3 pMax) {
|
|||||||
|
|
||||||
static bool
|
static bool
|
||||||
IntersectP(const Ray &ray, float3 pMin, float3 pMax, float *hit0, float *hit1) {
|
IntersectP(const Ray &ray, float3 pMin, float3 pMax, float *hit0, float *hit1) {
|
||||||
float t0 = -1e30, t1 = 1e30;
|
float t0 = -1e30f, t1 = 1e30f;
|
||||||
|
|
||||||
float3 tNear = (pMin - ray.origin) / ray.dir;
|
float3 tNear = (pMin - ray.origin) / ray.dir;
|
||||||
float3 tFar = (pMax - ray.origin) / ray.dir;
|
float3 tFar = (pMax - ray.origin) / ray.dir;
|
||||||
@@ -213,7 +213,7 @@ transmittance(float3 p0, float3 p1, float3 pMin,
|
|||||||
float tau = 0;
|
float tau = 0;
|
||||||
float rayLength = sqrtf(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
|
float rayLength = sqrtf(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
|
||||||
ray.dir.z * ray.dir.z);
|
ray.dir.z * ray.dir.z);
|
||||||
float stepDist = 0.2;
|
float stepDist = 0.2f;
|
||||||
float stepT = stepDist / rayLength;
|
float stepT = stepDist / rayLength;
|
||||||
|
|
||||||
float t = rayT0;
|
float t = rayT0;
|
||||||
@@ -239,8 +239,8 @@ distanceSquared(float3 a, float3 b) {
|
|||||||
static float
|
static float
|
||||||
raymarch(float density[], int nVoxels[3], const Ray &ray) {
|
raymarch(float density[], int nVoxels[3], const Ray &ray) {
|
||||||
float rayT0, rayT1;
|
float rayT0, rayT1;
|
||||||
float3 pMin(.3, -.2, .3), pMax(1.8, 2.3, 1.8);
|
float3 pMin(.3f, -.2f, .3f), pMax(1.8f, 2.3f, 1.8f);
|
||||||
float3 lightPos(-1, 4, 1.5);
|
float3 lightPos(-1.f, 4.f, 1.5f);
|
||||||
|
|
||||||
if (!IntersectP(ray, pMin, pMax, &rayT0, &rayT1))
|
if (!IntersectP(ray, pMin, pMax, &rayT0, &rayT1))
|
||||||
return 0.;
|
return 0.;
|
||||||
@@ -249,10 +249,10 @@ raymarch(float density[], int nVoxels[3], const Ray &ray) {
|
|||||||
|
|
||||||
// Parameters that define the volume scattering characteristics and
|
// Parameters that define the volume scattering characteristics and
|
||||||
// sampling rate for raymarching
|
// sampling rate for raymarching
|
||||||
float Le = .25; // Emission coefficient
|
float Le = .25f; // Emission coefficient
|
||||||
float sigma_a = 10; // Absorption coefficient
|
float sigma_a = 10; // Absorption coefficient
|
||||||
float sigma_s = 10; // Scattering coefficient
|
float sigma_s = 10; // Scattering coefficient
|
||||||
float stepDist = 0.025; // Ray step amount
|
float stepDist = 0.025f; // Ray step amount
|
||||||
float lightIntensity = 40; // Light source intensity
|
float lightIntensity = 40; // Light source intensity
|
||||||
|
|
||||||
float tau = 0.f; // accumulated beam transmittance
|
float tau = 0.f; // accumulated beam transmittance
|
||||||
@@ -269,7 +269,7 @@ raymarch(float density[], int nVoxels[3], const Ray &ray) {
|
|||||||
|
|
||||||
// terminate once attenuation is high
|
// terminate once attenuation is high
|
||||||
float atten = expf(-tau);
|
float atten = expf(-tau);
|
||||||
if (atten < .005)
|
if (atten < .005f)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
// direct lighting
|
// direct lighting
|
||||||
|
|||||||
336
expr.cpp
336
expr.cpp
@@ -741,6 +741,12 @@ UnaryExpr::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
UnaryExpr::EstimateCost() const {
|
||||||
|
return (expr ? expr->EstimateCost() : 0) + COST_SIMPLE_ARITH_LOGIC_OP;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
UnaryExpr::Print() const {
|
UnaryExpr::Print() const {
|
||||||
if (!expr || !GetType())
|
if (!expr || !GetType())
|
||||||
@@ -799,11 +805,17 @@ lOpString(BinaryExpr::Op op) {
|
|||||||
*/
|
*/
|
||||||
static llvm::Value *
|
static llvm::Value *
|
||||||
lEmitBinaryBitOp(BinaryExpr::Op op, llvm::Value *arg0Val,
|
lEmitBinaryBitOp(BinaryExpr::Op op, llvm::Value *arg0Val,
|
||||||
llvm::Value *arg1Val, FunctionEmitContext *ctx) {
|
llvm::Value *arg1Val, bool isUnsigned,
|
||||||
|
FunctionEmitContext *ctx) {
|
||||||
llvm::Instruction::BinaryOps inst;
|
llvm::Instruction::BinaryOps inst;
|
||||||
switch (op) {
|
switch (op) {
|
||||||
case BinaryExpr::Shl: inst = llvm::Instruction::Shl; break;
|
case BinaryExpr::Shl: inst = llvm::Instruction::Shl; break;
|
||||||
case BinaryExpr::Shr: inst = llvm::Instruction::AShr; break;
|
case BinaryExpr::Shr:
|
||||||
|
if (isUnsigned)
|
||||||
|
inst = llvm::Instruction::LShr;
|
||||||
|
else
|
||||||
|
inst = llvm::Instruction::AShr;
|
||||||
|
break;
|
||||||
case BinaryExpr::BitAnd: inst = llvm::Instruction::And; break;
|
case BinaryExpr::BitAnd: inst = llvm::Instruction::And; break;
|
||||||
case BinaryExpr::BitXor: inst = llvm::Instruction::Xor; break;
|
case BinaryExpr::BitXor: inst = llvm::Instruction::Xor; break;
|
||||||
case BinaryExpr::BitOr: inst = llvm::Instruction::Or; break;
|
case BinaryExpr::BitOr: inst = llvm::Instruction::Or; break;
|
||||||
@@ -949,7 +961,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
|
|||||||
dynamic_cast<ConstExpr *>(arg1) == NULL)
|
dynamic_cast<ConstExpr *>(arg1) == NULL)
|
||||||
PerformanceWarning(pos, "Shift right is extremely inefficient for "
|
PerformanceWarning(pos, "Shift right is extremely inefficient for "
|
||||||
"varying shift amounts.");
|
"varying shift amounts.");
|
||||||
return lEmitBinaryBitOp(op, e0Val, e1Val, ctx);
|
return lEmitBinaryBitOp(op, e0Val, e1Val,
|
||||||
|
arg0->GetType()->IsUnsignedType(), ctx);
|
||||||
}
|
}
|
||||||
case LogicalAnd:
|
case LogicalAnd:
|
||||||
return ctx->BinaryOperator(llvm::Instruction::And, e0Val, e1Val,
|
return ctx->BinaryOperator(llvm::Instruction::And, e0Val, e1Val,
|
||||||
@@ -1176,10 +1189,10 @@ BinaryExpr::Optimize() {
|
|||||||
m->symbolTable->LookupFunction("rcp");
|
m->symbolTable->LookupFunction("rcp");
|
||||||
if (rcpFuns != NULL) {
|
if (rcpFuns != NULL) {
|
||||||
assert(rcpFuns->size() == 2);
|
assert(rcpFuns->size() == 2);
|
||||||
Expr *rcpSymExpr = new FunctionSymbolExpr(rcpFuns, pos);
|
Expr *rcpSymExpr = new FunctionSymbolExpr("rcp", rcpFuns, pos);
|
||||||
ExprList *args = new ExprList(arg1, arg1->pos);
|
ExprList *args = new ExprList(arg1, arg1->pos);
|
||||||
Expr *rcpCall = new FunctionCallExpr(rcpSymExpr, args,
|
Expr *rcpCall = new FunctionCallExpr(rcpSymExpr, args,
|
||||||
arg1->pos, false);
|
arg1->pos);
|
||||||
rcpCall = rcpCall->TypeCheck();
|
rcpCall = rcpCall->TypeCheck();
|
||||||
if (rcpCall == NULL)
|
if (rcpCall == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -1292,6 +1305,17 @@ BinaryExpr::TypeCheck() {
|
|||||||
if (type0 == NULL || type1 == NULL)
|
if (type0 == NULL || type1 == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
if (dynamic_cast<const ReferenceType *>(type0) != NULL) {
|
||||||
|
arg0 = new DereferenceExpr(arg0, arg0->pos);
|
||||||
|
type0 = arg0->GetType();
|
||||||
|
assert(type0 != NULL);
|
||||||
|
}
|
||||||
|
if (dynamic_cast<const ReferenceType *>(type1) != NULL) {
|
||||||
|
arg1 = new DereferenceExpr(arg1, arg1->pos);
|
||||||
|
type1 = arg1->GetType();
|
||||||
|
assert(type1 != NULL);
|
||||||
|
}
|
||||||
|
|
||||||
switch (op) {
|
switch (op) {
|
||||||
case Shl:
|
case Shl:
|
||||||
case Shr:
|
case Shr:
|
||||||
@@ -1438,6 +1462,15 @@ BinaryExpr::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
BinaryExpr::EstimateCost() const {
|
||||||
|
return ((arg0 ? arg0->EstimateCost() : 0) +
|
||||||
|
(arg1 ? arg1->EstimateCost() : 0) +
|
||||||
|
((op == Div || op == Mod) ? COST_COMPLEX_ARITH_OP :
|
||||||
|
COST_SIMPLE_ARITH_LOGIC_OP));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
BinaryExpr::Print() const {
|
BinaryExpr::Print() const {
|
||||||
if (!arg0 || !arg1 || !GetType())
|
if (!arg0 || !arg1 || !GetType())
|
||||||
@@ -1533,7 +1566,8 @@ lEmitOpAssign(AssignExpr::Op op, Expr *arg0, Expr *arg1, const Type *type,
|
|||||||
case AssignExpr::AndAssign:
|
case AssignExpr::AndAssign:
|
||||||
case AssignExpr::XorAssign:
|
case AssignExpr::XorAssign:
|
||||||
case AssignExpr::OrAssign:
|
case AssignExpr::OrAssign:
|
||||||
newValue = lEmitBinaryBitOp(basicop, oldLHS, rvalue, ctx);
|
newValue = lEmitBinaryBitOp(basicop, oldLHS, rvalue,
|
||||||
|
arg0->GetType()->IsUnsignedType(), ctx);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
FATAL("logic error in lEmitOpAssign");
|
FATAL("logic error in lEmitOpAssign");
|
||||||
@@ -1688,6 +1722,20 @@ AssignExpr::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
AssignExpr::EstimateCost() const {
|
||||||
|
int cost = ((lvalue ? lvalue->EstimateCost() : 0) +
|
||||||
|
(rvalue ? rvalue->EstimateCost() : 0));
|
||||||
|
cost += COST_ASSIGN;
|
||||||
|
if (op == Assign)
|
||||||
|
return cost;
|
||||||
|
if (op == DivAssign || op == ModAssign)
|
||||||
|
return cost + COST_COMPLEX_ARITH_OP;
|
||||||
|
else
|
||||||
|
return cost + COST_SIMPLE_ARITH_LOGIC_OP;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
AssignExpr::Print() const {
|
AssignExpr::Print() const {
|
||||||
if (!lvalue || !rvalue || !GetType())
|
if (!lvalue || !rvalue || !GetType())
|
||||||
@@ -1936,6 +1984,12 @@ SelectExpr::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
SelectExpr::EstimateCost() const {
|
||||||
|
return COST_SELECT;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
SelectExpr::Print() const {
|
SelectExpr::Print() const {
|
||||||
if (!test || !expr1 || !expr2 || !GetType())
|
if (!test || !expr1 || !expr2 || !GetType())
|
||||||
@@ -2159,7 +2213,7 @@ FunctionCallExpr::tryResolve(bool (*matchFunc)(Expr *, const Type *)) {
|
|||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
FunctionCallExpr::resolveFunctionOverloads() {
|
FunctionCallExpr::resolveFunctionOverloads(bool exactMatchOnly) {
|
||||||
FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
|
FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
|
||||||
if (!fse)
|
if (!fse)
|
||||||
// error will be issued later if not calling an actual function
|
// error will be issued later if not calling an actual function
|
||||||
@@ -2173,6 +2227,7 @@ FunctionCallExpr::resolveFunctionOverloads() {
|
|||||||
if (tryResolve(lExactMatch))
|
if (tryResolve(lExactMatch))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
if (!exactMatchOnly) {
|
||||||
// Try to find a single match ignoring references
|
// Try to find a single match ignoring references
|
||||||
if (tryResolve(lMatchIgnoringReferences))
|
if (tryResolve(lMatchIgnoringReferences))
|
||||||
return;
|
return;
|
||||||
@@ -2193,73 +2248,34 @@ FunctionCallExpr::resolveFunctionOverloads() {
|
|||||||
// Last chance: try to find a match via arbitrary type conversion.
|
// Last chance: try to find a match via arbitrary type conversion.
|
||||||
if (tryResolve(lMatchWithTypeConv))
|
if (tryResolve(lMatchWithTypeConv))
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// failure :-(
|
// failure :-(
|
||||||
const char *funName = fse->candidateFunctions->front()->name.c_str();
|
const char *funName = fse->candidateFunctions->front()->name.c_str();
|
||||||
Error(pos, "Unable to find matching overload for call to function \"%s\".",
|
Error(pos, "Unable to find matching overload for call to function \"%s\"%s.",
|
||||||
funName);
|
funName, exactMatchOnly ? " only considering exact matches" : "");
|
||||||
fprintf(stderr, "Candidates are:\n");
|
fprintf(stderr, "Candidates are:\n");
|
||||||
lPrintFunctionOverloads(*fse->candidateFunctions);
|
lPrintFunctionOverloads(*fse->candidateFunctions);
|
||||||
lPrintPassedTypes(funName, args->exprs);
|
lPrintPassedTypes(funName, args->exprs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, bool il)
|
FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p,
|
||||||
: Expr(p) {
|
bool il, Expr *lce)
|
||||||
|
: Expr(p), isLaunch(il) {
|
||||||
func = f;
|
func = f;
|
||||||
args = a;
|
args = a;
|
||||||
isLaunch = il;
|
launchCountExpr = lce;
|
||||||
|
|
||||||
resolveFunctionOverloads();
|
FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
|
||||||
}
|
// Functions with names that start with "__" should only be various
|
||||||
|
// builtins. For those, we'll demand an exact match, since we'll
|
||||||
|
// expect whichever function in stdlib.ispc is calling out to one of
|
||||||
/** Starting from the function initialFunction, we're calling into
|
// those to be matching the argument types exactly; this is to be a bit
|
||||||
calledFunc. The question is: is this a recursive call back to
|
// extra safe to be sure that the expected builtin is in fact being
|
||||||
initialFunc? If it definitely is or if it may be, then return true.
|
// called.
|
||||||
Return false if it definitely is not.
|
bool exactMatchOnly = (fse != NULL) && (fse->name.substr(0,2) == "__");
|
||||||
*/
|
resolveFunctionOverloads(exactMatchOnly);
|
||||||
static bool
|
|
||||||
lMayBeRecursiveCall(llvm::Function *calledFunc,
|
|
||||||
llvm::Function *initialFunc,
|
|
||||||
std::set<llvm::Function *> &seenFuncs) {
|
|
||||||
// Easy case: intrinsics aren't going to call functions themselves
|
|
||||||
if (calledFunc->isIntrinsic())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
std::string name = calledFunc->getName();
|
|
||||||
if (name.size() > 2 && name[0] == '_' && name[1] == '_')
|
|
||||||
// builtin stdlib function; none of these are recursive...
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (calledFunc->isDeclaration())
|
|
||||||
// There's no visibility into what the called function does without a
|
|
||||||
// definition, so we have to be conservative
|
|
||||||
return true;
|
|
||||||
|
|
||||||
if (calledFunc == initialFunc)
|
|
||||||
// hello recursive call
|
|
||||||
return true;
|
|
||||||
|
|
||||||
// Otherwise iterate over all of the instructions in the function. If
|
|
||||||
// any of them is a function call then check recursively..
|
|
||||||
llvm::inst_iterator iter;
|
|
||||||
for (iter = llvm::inst_begin(calledFunc);
|
|
||||||
iter != llvm::inst_end(calledFunc); ++iter) {
|
|
||||||
llvm::Instruction *inst = &*iter;
|
|
||||||
llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst);
|
|
||||||
if (ci != NULL) {
|
|
||||||
llvm::Function *nextCalledFunc = ci->getCalledFunction();
|
|
||||||
// Don't repeatedly test functions we've seen before
|
|
||||||
if (seenFuncs.find(nextCalledFunc) == seenFuncs.end()) {
|
|
||||||
seenFuncs.insert(nextCalledFunc);
|
|
||||||
if (lMayBeRecursiveCall(nextCalledFunc, initialFunc,
|
|
||||||
seenFuncs))
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -2383,47 +2399,18 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// We sometimes need to check to see if the mask is all off here;
|
|
||||||
// specifically, if the mask is all off and we call a recursive
|
|
||||||
// function, then we will probably have an undesirable infinite loop.
|
|
||||||
ctx->SetDebugPos(pos);
|
|
||||||
llvm::BasicBlock *bDoCall = ctx->CreateBasicBlock("funcall_mask_ok");
|
|
||||||
llvm::BasicBlock *bSkip = ctx->CreateBasicBlock("funcall_mask_off");
|
|
||||||
llvm::BasicBlock *bAfter = ctx->CreateBasicBlock("after_funcall");
|
|
||||||
llvm::Function *currentFunc = ctx->GetCurrentBasicBlock()->getParent();
|
|
||||||
|
|
||||||
// If we need to check the mask (it may be a recursive call, possibly
|
|
||||||
// transitively), or we're launching a task, which is expensive and
|
|
||||||
// thus probably always worth checking, then use the mask to choose
|
|
||||||
// whether to go to the bDoCallBlock or the bSkip block
|
|
||||||
std::set<llvm::Function *> seenFuncs;
|
|
||||||
seenFuncs.insert(currentFunc);
|
|
||||||
if (ft->isTask || lMayBeRecursiveCall(callee, currentFunc, seenFuncs)) {
|
|
||||||
Debug(pos, "Checking mask before function call \"%s\".", funSym->name.c_str());
|
|
||||||
ctx->BranchIfMaskAny(bDoCall, bSkip);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
// If we don't need to check the mask, then always do the call;
|
|
||||||
// just jump to bDoCall
|
|
||||||
ctx->BranchInst(bDoCall);
|
|
||||||
|
|
||||||
// And the bSkip block just jumps immediately to bAfter. So why do we
|
|
||||||
// need it? So the phi node below can easily tell what paths are
|
|
||||||
// going into it
|
|
||||||
ctx->SetCurrentBasicBlock(bSkip);
|
|
||||||
ctx->BranchInst(bAfter);
|
|
||||||
|
|
||||||
// Emit the code to do the function call
|
|
||||||
ctx->SetCurrentBasicBlock(bDoCall);
|
|
||||||
|
|
||||||
llvm::Value *retVal = NULL;
|
llvm::Value *retVal = NULL;
|
||||||
ctx->SetDebugPos(pos);
|
ctx->SetDebugPos(pos);
|
||||||
if (ft->isTask)
|
if (ft->isTask) {
|
||||||
ctx->LaunchInst(callee, argVals);
|
assert(launchCountExpr != NULL);
|
||||||
|
llvm::Value *launchCount = launchCountExpr->GetValue(ctx);
|
||||||
|
if (launchCount != NULL)
|
||||||
|
ctx->LaunchInst(callee, argVals, launchCount);
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
// Most of the time, the mask is passed as the last argument. this
|
// Most of the time, the mask is passed as the last argument. this
|
||||||
// isn't the case for things like SSE intrinsics and extern "C"
|
// isn't the case for things like intrinsics, builtins, and extern
|
||||||
// functions from the application.
|
// "C" functions from the application.
|
||||||
assert(callargs.size() + 1 == callee->arg_size() ||
|
assert(callargs.size() + 1 == callee->arg_size() ||
|
||||||
callargs.size() == callee->arg_size());
|
callargs.size() == callee->arg_size());
|
||||||
|
|
||||||
@@ -2450,22 +2437,10 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// And jump out to the 'after function call' basic block
|
|
||||||
ctx->BranchInst(bAfter);
|
|
||||||
ctx->SetCurrentBasicBlock(bAfter);
|
|
||||||
|
|
||||||
if (isVoidFunc)
|
if (isVoidFunc)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
else
|
||||||
// The return value for the non-void case is either undefined or the
|
return retVal;
|
||||||
// function return value, depending on whether we actually ran the code
|
|
||||||
// path that called the function or not.
|
|
||||||
LLVM_TYPE_CONST llvm::Type *lrType = ft->GetReturnType()->LLVMType(g->ctx);
|
|
||||||
llvm::PHINode *ret = ctx->PhiNode(lrType, 2, "fun_ret");
|
|
||||||
assert(retVal != NULL);
|
|
||||||
ret->addIncoming(llvm::UndefValue::get(lrType), bSkip);
|
|
||||||
ret->addIncoming(retVal, bDoCall);
|
|
||||||
return ret;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -2507,10 +2482,21 @@ FunctionCallExpr::TypeCheck() {
|
|||||||
if (!isLaunch)
|
if (!isLaunch)
|
||||||
Error(pos, "\"launch\" expression needed to call function "
|
Error(pos, "\"launch\" expression needed to call function "
|
||||||
"with \"task\" qualifier.");
|
"with \"task\" qualifier.");
|
||||||
|
if (!launchCountExpr)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
launchCountExpr =
|
||||||
|
launchCountExpr->TypeConv(AtomicType::UniformInt32,
|
||||||
|
"task launch count");
|
||||||
|
if (!launchCountExpr)
|
||||||
|
return NULL;
|
||||||
}
|
}
|
||||||
else if (isLaunch)
|
else {
|
||||||
|
if (isLaunch)
|
||||||
Error(pos, "\"launch\" expression illegal with non-\"task\"-"
|
Error(pos, "\"launch\" expression illegal with non-\"task\"-"
|
||||||
"qualified function.");
|
"qualified function.");
|
||||||
|
assert(launchCountExpr == NULL);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
Error(pos, "Valid function name must be used for function call.");
|
Error(pos, "Valid function name must be used for function call.");
|
||||||
@@ -2526,6 +2512,13 @@ FunctionCallExpr::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
FunctionCallExpr::EstimateCost() const {
|
||||||
|
return ((args ? args->EstimateCost() : 0) +
|
||||||
|
(isLaunch ? COST_TASK_LAUNCH : COST_FUNCALL));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
FunctionCallExpr::Print() const {
|
FunctionCallExpr::Print() const {
|
||||||
if (!func || !args || !GetType())
|
if (!func || !args || !GetType())
|
||||||
@@ -2614,7 +2607,7 @@ ExprList::GetConstant(const Type *type) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (dynamic_cast<const StructType *>(type) != NULL) {
|
if (dynamic_cast<const StructType *>(type) != NULL) {
|
||||||
#if defined(LLVM_2_8) || defined(LLVM_2_9)
|
#if defined(LLVM_2_9)
|
||||||
return llvm::ConstantStruct::get(*g->ctx, cv, false);
|
return llvm::ConstantStruct::get(*g->ctx, cv, false);
|
||||||
#else
|
#else
|
||||||
LLVM_TYPE_CONST llvm::StructType *llvmStructType =
|
LLVM_TYPE_CONST llvm::StructType *llvmStructType =
|
||||||
@@ -2637,6 +2630,17 @@ ExprList::GetConstant(const Type *type) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
ExprList::EstimateCost() const {
|
||||||
|
int cost = 0;
|
||||||
|
for (unsigned int i = 0; i < exprs.size(); ++i) {
|
||||||
|
if (exprs[i] != NULL)
|
||||||
|
cost += exprs[i]->EstimateCost();
|
||||||
|
}
|
||||||
|
return cost;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
ExprList::Print() const {
|
ExprList::Print() const {
|
||||||
printf("expr list (");
|
printf("expr list (");
|
||||||
@@ -2767,6 +2771,22 @@ IndexExpr::GetLValue(FunctionEmitContext *ctx) const {
|
|||||||
if (!basePtr)
|
if (!basePtr)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
// If the array index is a compile time constant, check to see if it
|
||||||
|
// may lead to an out-of-bounds access.
|
||||||
|
ConstExpr *ce = dynamic_cast<ConstExpr *>(index);
|
||||||
|
const SequentialType *seqType = dynamic_cast<const SequentialType *>(type);
|
||||||
|
assert(seqType != NULL);
|
||||||
|
int nElements = seqType->GetElementCount();
|
||||||
|
if (ce != NULL && nElements > 0) {
|
||||||
|
int32_t indices[ISPC_MAX_NVEC];
|
||||||
|
int count = ce->AsInt32(indices);
|
||||||
|
for (int i = 0; i < count; ++i) {
|
||||||
|
if (indices[i] < 0 || indices[i] >= nElements)
|
||||||
|
Warning(index->pos, "Array index \"%d\" may be out of bounds for "
|
||||||
|
"\"%d\" element array.", indices[i], nElements);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
basePtr = lCastUniformVectorBasePtr(basePtr, ctx);
|
basePtr = lCastUniformVectorBasePtr(basePtr, ctx);
|
||||||
|
|
||||||
ctx->SetDebugPos(pos);
|
ctx->SetDebugPos(pos);
|
||||||
@@ -2819,6 +2839,16 @@ IndexExpr::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
IndexExpr::EstimateCost() const {
|
||||||
|
// be pessimistic
|
||||||
|
if (index && index->GetType()->IsVaryingType())
|
||||||
|
return COST_GATHER;
|
||||||
|
else
|
||||||
|
return COST_LOAD;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
IndexExpr::Print() const {
|
IndexExpr::Print() const {
|
||||||
if (!arrayOrVector || !index || !GetType())
|
if (!arrayOrVector || !index || !GetType())
|
||||||
@@ -3118,6 +3148,7 @@ MemberExpr::create(Expr *e, const char *id, SourcePos p, SourcePos idpos) {
|
|||||||
return new MemberExpr(e, id, p, idpos);
|
return new MemberExpr(e, id, p, idpos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
MemberExpr::MemberExpr(Expr *e, const char *id, SourcePos p, SourcePos idpos)
|
MemberExpr::MemberExpr(Expr *e, const char *id, SourcePos p, SourcePos idpos)
|
||||||
: Expr(p), identifierPos(idpos) {
|
: Expr(p), identifierPos(idpos) {
|
||||||
expr = e;
|
expr = e;
|
||||||
@@ -3214,6 +3245,14 @@ MemberExpr::Optimize() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
MemberExpr::EstimateCost() const {
|
||||||
|
// FIXME: return gather cost when we can tell a gather is going to be
|
||||||
|
// needed
|
||||||
|
return COST_SIMPLE_ARITH_LOGIC_OP;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
MemberExpr::Print() const {
|
MemberExpr::Print() const {
|
||||||
if (!expr || !GetType())
|
if (!expr || !GetType())
|
||||||
@@ -3281,7 +3320,7 @@ ConstExpr::ConstExpr(const Type *t, uint8_t u, SourcePos p)
|
|||||||
: Expr(p) {
|
: Expr(p) {
|
||||||
type = t;
|
type = t;
|
||||||
type = type->GetAsConstType();
|
type = type->GetAsConstType();
|
||||||
assert(type == AtomicType::UniformUInt8);
|
assert(type == AtomicType::UniformConstUInt8);
|
||||||
uint8Val[0] = u;
|
uint8Val[0] = u;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3321,7 +3360,7 @@ ConstExpr::ConstExpr(const Type *t, uint16_t u, SourcePos p)
|
|||||||
: Expr(p) {
|
: Expr(p) {
|
||||||
type = t;
|
type = t;
|
||||||
type = type->GetAsConstType();
|
type = type->GetAsConstType();
|
||||||
assert(type == AtomicType::UniformUInt16);
|
assert(type == AtomicType::UniformConstUInt16);
|
||||||
uint16Val[0] = u;
|
uint16Val[0] = u;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3424,7 +3463,7 @@ ConstExpr::ConstExpr(const Type *t, uint64_t u, SourcePos p)
|
|||||||
: Expr(p) {
|
: Expr(p) {
|
||||||
type = t;
|
type = t;
|
||||||
type = type->GetAsConstType();
|
type = type->GetAsConstType();
|
||||||
assert(type == AtomicType::UniformUInt64);
|
assert(type == AtomicType::UniformConstUInt64);
|
||||||
uint64Val[0] = u;
|
uint64Val[0] = u;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4009,6 +4048,12 @@ ConstExpr::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
ConstExpr::EstimateCost() const {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
ConstExpr::Print() const {
|
ConstExpr::Print() const {
|
||||||
printf("[%s] (", GetType()->GetString().c_str());
|
printf("[%s] (", GetType()->GetString().c_str());
|
||||||
@@ -4095,7 +4140,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
|
|||||||
case AtomicType::TYPE_BOOL:
|
case AtomicType::TYPE_BOOL:
|
||||||
if (fromType->IsVaryingType() &&
|
if (fromType->IsVaryingType() &&
|
||||||
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
|
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
|
||||||
// If we have a bool vector of i32 element,s first truncate
|
// If we have a bool vector of i32 elements, first truncate
|
||||||
// down to a single bit
|
// down to a single bit
|
||||||
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
|
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
|
||||||
// And then do an unsigned int->float cast
|
// And then do an unsigned int->float cast
|
||||||
@@ -4155,9 +4200,6 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
|
|||||||
case AtomicType::TYPE_UINT16:
|
case AtomicType::TYPE_UINT16:
|
||||||
case AtomicType::TYPE_UINT32:
|
case AtomicType::TYPE_UINT32:
|
||||||
case AtomicType::TYPE_UINT64:
|
case AtomicType::TYPE_UINT64:
|
||||||
if (fromType->IsVaryingType())
|
|
||||||
PerformanceWarning(pos, "Conversion from unsigned int64 to float is slow. "
|
|
||||||
"Use \"int64\" if possible");
|
|
||||||
cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
|
cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
|
||||||
exprVal, targetType, "uint2double");
|
exprVal, targetType, "uint2double");
|
||||||
break;
|
break;
|
||||||
@@ -4929,6 +4971,13 @@ TypeCastExpr::Optimize() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
TypeCastExpr::EstimateCost() const {
|
||||||
|
// FIXME: return COST_TYPECAST_COMPLEX when appropriate
|
||||||
|
return COST_TYPECAST_SIMPLE;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
TypeCastExpr::Print() const {
|
TypeCastExpr::Print() const {
|
||||||
printf("[%s] type cast (", GetType()->GetString().c_str());
|
printf("[%s] type cast (", GetType()->GetString().c_str());
|
||||||
@@ -4994,6 +5043,12 @@ ReferenceExpr::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
ReferenceExpr::EstimateCost() const {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
ReferenceExpr::Print() const {
|
ReferenceExpr::Print() const {
|
||||||
if (expr == NULL || GetType() == NULL)
|
if (expr == NULL || GetType() == NULL)
|
||||||
@@ -5072,6 +5127,12 @@ DereferenceExpr::Optimize() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
DereferenceExpr::EstimateCost() const {
|
||||||
|
return COST_DEREF;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
DereferenceExpr::Print() const {
|
DereferenceExpr::Print() const {
|
||||||
if (expr == NULL || GetType() == NULL)
|
if (expr == NULL || GetType() == NULL)
|
||||||
@@ -5143,6 +5204,15 @@ SymbolExpr::Optimize() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
SymbolExpr::EstimateCost() const {
|
||||||
|
if (symbol->constValue != NULL)
|
||||||
|
return 0;
|
||||||
|
else
|
||||||
|
return COST_LOAD;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
SymbolExpr::Print() const {
|
SymbolExpr::Print() const {
|
||||||
if (symbol == NULL || GetType() == NULL)
|
if (symbol == NULL || GetType() == NULL)
|
||||||
@@ -5157,9 +5227,11 @@ SymbolExpr::Print() const {
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// FunctionSymbolExpr
|
// FunctionSymbolExpr
|
||||||
|
|
||||||
FunctionSymbolExpr::FunctionSymbolExpr(std::vector<Symbol *> *candidates,
|
FunctionSymbolExpr::FunctionSymbolExpr(const char *n,
|
||||||
|
std::vector<Symbol *> *candidates,
|
||||||
SourcePos p)
|
SourcePos p)
|
||||||
: Expr(p) {
|
: Expr(p) {
|
||||||
|
name = n;
|
||||||
matchingFunc = NULL;
|
matchingFunc = NULL;
|
||||||
candidateFunctions = candidates;
|
candidateFunctions = candidates;
|
||||||
}
|
}
|
||||||
@@ -5196,6 +5268,12 @@ FunctionSymbolExpr::Optimize() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
FunctionSymbolExpr::EstimateCost() const {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
FunctionSymbolExpr::Print() const {
|
FunctionSymbolExpr::Print() const {
|
||||||
if (!matchingFunc || !GetType())
|
if (!matchingFunc || !GetType())
|
||||||
@@ -5219,14 +5297,14 @@ SyncExpr::GetType() const {
|
|||||||
llvm::Value *
|
llvm::Value *
|
||||||
SyncExpr::GetValue(FunctionEmitContext *ctx) const {
|
SyncExpr::GetValue(FunctionEmitContext *ctx) const {
|
||||||
ctx->SetDebugPos(pos);
|
ctx->SetDebugPos(pos);
|
||||||
std::vector<llvm::Value *> noArg;
|
ctx->SyncInst();
|
||||||
llvm::Function *fsync = m->module->getFunction("ISPCSync");
|
|
||||||
if (fsync == NULL) {
|
|
||||||
FATAL("Couldn't find ISPCSync declaration?!");
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
return ctx->CallInst(fsync, noArg, "");
|
|
||||||
|
int
|
||||||
|
SyncExpr::EstimateCost() const {
|
||||||
|
return COST_SYNC;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
52
expr.h
52
expr.h
@@ -121,8 +121,8 @@ public:
|
|||||||
void Print() const;
|
void Print() const;
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
const Op op;
|
const Op op;
|
||||||
Expr *expr;
|
Expr *expr;
|
||||||
};
|
};
|
||||||
@@ -164,8 +164,8 @@ public:
|
|||||||
|
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
const Op op;
|
const Op op;
|
||||||
Expr *arg0, *arg1;
|
Expr *arg0, *arg1;
|
||||||
};
|
};
|
||||||
@@ -196,8 +196,8 @@ public:
|
|||||||
|
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
const Op op;
|
const Op op;
|
||||||
Expr *lvalue, *rvalue;
|
Expr *lvalue, *rvalue;
|
||||||
};
|
};
|
||||||
@@ -217,8 +217,8 @@ public:
|
|||||||
|
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
Expr *test, *expr1, *expr2;
|
Expr *test, *expr1, *expr2;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -240,6 +240,7 @@ public:
|
|||||||
llvm::Constant *GetConstant(const Type *type) const;
|
llvm::Constant *GetConstant(const Type *type) const;
|
||||||
ExprList *Optimize();
|
ExprList *Optimize();
|
||||||
ExprList *TypeCheck();
|
ExprList *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
std::vector<Expr *> exprs;
|
std::vector<Expr *> exprs;
|
||||||
};
|
};
|
||||||
@@ -249,7 +250,8 @@ public:
|
|||||||
*/
|
*/
|
||||||
class FunctionCallExpr : public Expr {
|
class FunctionCallExpr : public Expr {
|
||||||
public:
|
public:
|
||||||
FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, bool isLaunch);
|
FunctionCallExpr(Expr *func, ExprList *args, SourcePos p,
|
||||||
|
bool isLaunch = false, Expr *launchCountExpr = NULL);
|
||||||
|
|
||||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||||
const Type *GetType() const;
|
const Type *GetType() const;
|
||||||
@@ -257,13 +259,15 @@ public:
|
|||||||
|
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
Expr *func;
|
Expr *func;
|
||||||
ExprList *args;
|
ExprList *args;
|
||||||
bool isLaunch;
|
bool isLaunch;
|
||||||
|
Expr *launchCountExpr;
|
||||||
|
|
||||||
void resolveFunctionOverloads();
|
private:
|
||||||
|
void resolveFunctionOverloads(bool exactMatchOnly);
|
||||||
bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
|
bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -285,8 +289,8 @@ public:
|
|||||||
|
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
Expr *arrayOrVector, *index;
|
Expr *arrayOrVector, *index;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -303,16 +307,17 @@ public:
|
|||||||
MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
|
MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
|
||||||
SourcePos identifierPos);
|
SourcePos identifierPos);
|
||||||
|
|
||||||
virtual llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||||
virtual llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||||
virtual const Type *GetType() const;
|
const Type *GetType() const;
|
||||||
virtual Symbol *GetBaseSymbol() const;
|
Symbol *GetBaseSymbol() const;
|
||||||
virtual void Print() const;
|
void Print() const;
|
||||||
virtual Expr *Optimize();
|
Expr *Optimize();
|
||||||
virtual Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
virtual int getElementNumber() const;
|
virtual int getElementNumber() const;
|
||||||
|
|
||||||
protected:
|
|
||||||
std::string getCandidateNearMatches() const;
|
std::string getCandidateNearMatches() const;
|
||||||
|
|
||||||
Expr *expr;
|
Expr *expr;
|
||||||
@@ -392,6 +397,7 @@ public:
|
|||||||
|
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
/** Return the ConstExpr's values as booleans, doing type conversion
|
/** Return the ConstExpr's values as booleans, doing type conversion
|
||||||
from the actual type if needed. If forceVarying is true, then type
|
from the actual type if needed. If forceVarying is true, then type
|
||||||
@@ -495,8 +501,8 @@ public:
|
|||||||
void Print() const;
|
void Print() const;
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
const Type *type;
|
const Type *type;
|
||||||
Expr *expr;
|
Expr *expr;
|
||||||
};
|
};
|
||||||
@@ -514,8 +520,8 @@ public:
|
|||||||
void Print() const;
|
void Print() const;
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
Expr *expr;
|
Expr *expr;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -533,8 +539,8 @@ public:
|
|||||||
void Print() const;
|
void Print() const;
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
Expr *expr;
|
Expr *expr;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -551,6 +557,7 @@ public:
|
|||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
void Print() const;
|
void Print() const;
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Symbol *symbol;
|
Symbol *symbol;
|
||||||
@@ -562,7 +569,7 @@ private:
|
|||||||
*/
|
*/
|
||||||
class FunctionSymbolExpr : public Expr {
|
class FunctionSymbolExpr : public Expr {
|
||||||
public:
|
public:
|
||||||
FunctionSymbolExpr(std::vector<Symbol *> *candidateFunctions,
|
FunctionSymbolExpr(const char *name, std::vector<Symbol *> *candidateFunctions,
|
||||||
SourcePos pos);
|
SourcePos pos);
|
||||||
|
|
||||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||||
@@ -571,10 +578,14 @@ public:
|
|||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
void Print() const;
|
void Print() const;
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend class FunctionCallExpr;
|
friend class FunctionCallExpr;
|
||||||
|
|
||||||
|
/** Name of the function that is being called. */
|
||||||
|
std::string name;
|
||||||
|
|
||||||
/** All of the functions with the name given in the function call;
|
/** All of the functions with the name given in the function call;
|
||||||
there may be more than one, in which case we need to resolve which
|
there may be more than one, in which case we need to resolve which
|
||||||
overload is the best match. */
|
overload is the best match. */
|
||||||
@@ -597,6 +608,7 @@ public:
|
|||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
void Print() const;
|
void Print() const;
|
||||||
|
int EstimateCost() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // ISPC_EXPR_H
|
#endif // ISPC_EXPR_H
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
|
|||||||
varying int3 vv = array[a];
|
varying int3 vv = array[a];
|
||||||
++vv.y;
|
++vv.y;
|
||||||
array[a] = vv;
|
array[a] = vv;
|
||||||
print("fin %\n", array[programIndex].y);
|
//CO print("fin %\n", array[programIndex].y);
|
||||||
ret[programIndex] = array[programIndex].y;
|
ret[programIndex] = array[programIndex].y;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,19 +1,14 @@
|
|||||||
static float float4(uniform float a, uniform float b, uniform float c,
|
|
||||||
uniform float d) {
|
export uniform int width() { return programCount; }
|
||||||
float ret = 0;
|
|
||||||
for (uniform int i = 0; i < programCount; i += 4) {
|
export void f_f(uniform float r[], uniform float a[]) {
|
||||||
ret = insert(ret, i + 0, a);
|
unsigned int i = (unsigned int)a[programIndex];
|
||||||
ret = insert(ret, i + 1, b);
|
r[programIndex] = max((unsigned int)2, i);
|
||||||
ret = insert(ret, i + 2, c);
|
|
||||||
ret = insert(ret, i + 3, d);
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export float f_f(float a) {
|
export void result(uniform float r[]) {
|
||||||
unsigned int i = (unsigned int)a;
|
r[programIndex] = 1+programIndex;
|
||||||
return max((unsigned int)2, i);
|
r[0] = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
export float result() { return float4(2,2,3,4); }
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
|
|
||||||
export float f_f(float a) {
|
export uniform int width() { return programCount; }
|
||||||
unsigned int i = (unsigned int)a;
|
|
||||||
return max((unsigned int)10, i);
|
export void f_f(uniform float result[], uniform float aa[]) {
|
||||||
|
unsigned int i = (unsigned int)aa[programIndex];
|
||||||
|
result[programIndex] = max((unsigned int)100, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
export float result() { return 10; }
|
export void result(uniform float r[]) { r[programIndex] = 100; }
|
||||||
|
|
||||||
|
|||||||
@@ -1,19 +1,14 @@
|
|||||||
static float float4(uniform float a, uniform float b, uniform float c,
|
|
||||||
uniform float d) {
|
export uniform int width() { return programCount; }
|
||||||
float ret = 0;
|
|
||||||
for (uniform int i = 0; i < programCount; i += 4) {
|
export void f_f(uniform float result[], uniform float aa[]) {
|
||||||
ret = insert(ret, i + 0, a);
|
unsigned int i = (unsigned int)aa[programIndex];
|
||||||
ret = insert(ret, i + 1, b);
|
result[programIndex] = min((unsigned int)2, i);
|
||||||
ret = insert(ret, i + 2, c);
|
|
||||||
ret = insert(ret, i + 3, d);
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export float f_f(float a) {
|
export void result(uniform float r[]) {
|
||||||
unsigned int i = (unsigned int)a;
|
r[programIndex] = 2;
|
||||||
return min((unsigned int)2, i);
|
r[0] = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
export float result() { return float4(1,2,2,2); }
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,19 +1,13 @@
|
|||||||
static float float4(uniform float a, uniform float b, uniform float c,
|
|
||||||
uniform float d) {
|
export uniform int width() { return programCount; }
|
||||||
float ret = 0;
|
|
||||||
for (uniform int i = 0; i < programCount; i += 4) {
|
export void f_f(uniform float r[], uniform float a[]) {
|
||||||
ret = insert(ret, i + 0, a);
|
unsigned int i = (unsigned int)a[programIndex];
|
||||||
ret = insert(ret, i + 1, b);
|
r[programIndex] = min((unsigned int)20, i);
|
||||||
ret = insert(ret, i + 2, c);
|
|
||||||
ret = insert(ret, i + 3, d);
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export float f_f(float a) {
|
export void result(uniform float r[]) {
|
||||||
unsigned int i = (unsigned int)a;
|
r[programIndex] = 1+programIndex;
|
||||||
return min((unsigned int)20, i);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export float result() { return float4(1,2,3,4); }
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,11 +0,0 @@
|
|||||||
|
|
||||||
struct Foo {
|
|
||||||
float f;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
export float foo(Foo f[], int i, uniform int j) {
|
|
||||||
Foo x = f[i];
|
|
||||||
return x.f;
|
|
||||||
}
|
|
||||||
|
|
||||||
208
ispc.cpp
208
ispc.cpp
@@ -42,14 +42,25 @@
|
|||||||
#ifdef ISPC_IS_WINDOWS
|
#ifdef ISPC_IS_WINDOWS
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#include <direct.h>
|
#include <direct.h>
|
||||||
|
#define strcasecmp stricmp
|
||||||
#endif
|
#endif
|
||||||
#include <llvm/LLVMContext.h>
|
#include <llvm/LLVMContext.h>
|
||||||
#include <llvm/Module.h>
|
#include <llvm/Module.h>
|
||||||
#ifndef LLVM_2_8
|
|
||||||
#include <llvm/Analysis/DIBuilder.h>
|
#include <llvm/Analysis/DIBuilder.h>
|
||||||
#endif
|
|
||||||
#include <llvm/Analysis/DebugInfo.h>
|
#include <llvm/Analysis/DebugInfo.h>
|
||||||
#include <llvm/Support/Dwarf.h>
|
#include <llvm/Support/Dwarf.h>
|
||||||
|
#include <llvm/Target/TargetMachine.h>
|
||||||
|
#include <llvm/Target/TargetOptions.h>
|
||||||
|
#include <llvm/Target/TargetData.h>
|
||||||
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
|
#include <llvm/Support/TargetRegistry.h>
|
||||||
|
#include <llvm/Support/TargetSelect.h>
|
||||||
|
#else
|
||||||
|
#include <llvm/Target/TargetRegistry.h>
|
||||||
|
#include <llvm/Target/TargetSelect.h>
|
||||||
|
#include <llvm/Target/SubtargetFeature.h>
|
||||||
|
#endif
|
||||||
|
#include <llvm/Support/Host.h>
|
||||||
|
|
||||||
Globals *g;
|
Globals *g;
|
||||||
Module *m;
|
Module *m;
|
||||||
@@ -57,21 +68,198 @@ Module *m;
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Target
|
// Target
|
||||||
|
|
||||||
Target::Target() {
|
bool
|
||||||
|
Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||||
|
bool pic, Target *t) {
|
||||||
|
if (cpu == NULL) {
|
||||||
|
std::string hostCPU = llvm::sys::getHostCPUName();
|
||||||
|
if (hostCPU.size() > 0)
|
||||||
|
cpu = hostCPU.c_str();
|
||||||
|
else {
|
||||||
|
fprintf(stderr, "Warning: unable to determine host CPU!\n");
|
||||||
|
cpu = "generic";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
t->cpu = cpu;
|
||||||
|
|
||||||
|
if (isa == NULL) {
|
||||||
|
if (!strcasecmp(cpu, "atom"))
|
||||||
|
isa = "sse2";
|
||||||
|
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||||
|
else if (!strcasecmp(cpu, "sandybridge") ||
|
||||||
|
!strcasecmp(cpu, "corei7-avx"))
|
||||||
|
isa = "avx";
|
||||||
|
#endif // LLVM_3_0
|
||||||
|
else
|
||||||
|
isa = "sse4";
|
||||||
|
}
|
||||||
|
if (arch == NULL)
|
||||||
arch = "x86-64";
|
arch = "x86-64";
|
||||||
cpu = "nehalem";
|
|
||||||
is32bit = false;
|
bool error = false;
|
||||||
isa = SSE4;
|
|
||||||
nativeVectorWidth = 4;
|
t->generatePIC = pic;
|
||||||
vectorWidth = 4;
|
|
||||||
|
// Make sure the target architecture is a known one; print an error
|
||||||
|
// with the valid ones otherwise.
|
||||||
|
t->target = NULL;
|
||||||
|
for (llvm::TargetRegistry::iterator iter = llvm::TargetRegistry::begin();
|
||||||
|
iter != llvm::TargetRegistry::end(); ++iter) {
|
||||||
|
if (std::string(arch) == iter->getName()) {
|
||||||
|
t->target = &*iter;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (t->target == NULL) {
|
||||||
|
fprintf(stderr, "Invalid architecture \"%s\"\nOptions: ", arch);
|
||||||
|
llvm::TargetRegistry::iterator iter;
|
||||||
|
for (iter = llvm::TargetRegistry::begin();
|
||||||
|
iter != llvm::TargetRegistry::end(); ++iter)
|
||||||
|
fprintf(stderr, "%s ", iter->getName());
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
error = true;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
t->arch = arch;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!strcasecmp(isa, "sse2")) {
|
||||||
|
t->isa = Target::SSE2;
|
||||||
|
t->nativeVectorWidth = 4;
|
||||||
|
t->vectorWidth = 4;
|
||||||
|
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
|
||||||
|
}
|
||||||
|
else if (!strcasecmp(isa, "sse4")) {
|
||||||
|
t->isa = Target::SSE4;
|
||||||
|
t->nativeVectorWidth = 4;
|
||||||
|
t->vectorWidth = 4;
|
||||||
|
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
||||||
|
}
|
||||||
|
else if (!strcasecmp(isa, "sse4x2")) {
|
||||||
|
t->isa = Target::SSE4;
|
||||||
|
t->nativeVectorWidth = 4;
|
||||||
|
t->vectorWidth = 8;
|
||||||
|
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
||||||
|
}
|
||||||
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
|
else if (!strcasecmp(isa, "avx")) {
|
||||||
|
t->isa = Target::AVX;
|
||||||
|
t->nativeVectorWidth = 8;
|
||||||
|
t->vectorWidth = 8;
|
||||||
|
t->attributes = "+avx,+popcnt,+cmov";
|
||||||
|
}
|
||||||
|
else if (!strcasecmp(isa, "avx-x2")) {
|
||||||
|
t->isa = Target::AVX;
|
||||||
|
t->nativeVectorWidth = 8;
|
||||||
|
t->vectorWidth = 16;
|
||||||
|
t->attributes = "+avx,+popcnt,+cmov";
|
||||||
|
}
|
||||||
|
#endif // LLVM 3.0
|
||||||
|
else {
|
||||||
|
fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n",
|
||||||
|
isa, SupportedTargetISAs());
|
||||||
|
error = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!error) {
|
||||||
|
llvm::TargetMachine *targetMachine = t->GetTargetMachine();
|
||||||
|
const llvm::TargetData *targetData = targetMachine->getTargetData();
|
||||||
|
t->is32bit = (targetData->getPointerSize() == 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
return !error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
const char *
|
||||||
|
Target::SupportedTargetCPUs() {
|
||||||
|
return "atom, barcelona, core2, corei7, "
|
||||||
|
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||||
|
"corei7-avx, "
|
||||||
|
#endif
|
||||||
|
"istanbul, nocona, penryn, "
|
||||||
|
#ifdef LLVM_2_9
|
||||||
|
"sandybridge, "
|
||||||
|
#endif
|
||||||
|
"westmere";
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
const char *
|
||||||
|
Target::SupportedTargetArchs() {
|
||||||
|
return "x86, x86-64";
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
const char *
|
||||||
|
Target::SupportedTargetISAs() {
|
||||||
|
return "sse2, sse4, sse4x2"
|
||||||
|
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||||
|
", avx, avx-x2"
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
std::string
|
||||||
|
Target::GetTripleString() const {
|
||||||
|
llvm::Triple triple;
|
||||||
|
// Start with the host triple as the default
|
||||||
|
triple.setTriple(llvm::sys::getHostTriple());
|
||||||
|
|
||||||
|
// And override the arch in the host triple based on what the user
|
||||||
|
// specified. Here we need to deal with the fact that LLVM uses one
|
||||||
|
// naming convention for targets in TargetRegistry, but wants some
|
||||||
|
// slightly different ones for the triple. TODO: is there a way to
|
||||||
|
// have it do this remapping, which would presumably be a bit less
|
||||||
|
// error prone?
|
||||||
|
if (arch == "x86")
|
||||||
|
triple.setArchName("i386");
|
||||||
|
else if (arch == "x86-64")
|
||||||
|
triple.setArchName("x86_64");
|
||||||
|
else
|
||||||
|
triple.setArchName(arch);
|
||||||
|
|
||||||
|
return triple.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
llvm::TargetMachine *
|
||||||
|
Target::GetTargetMachine() const {
|
||||||
|
std::string triple = GetTripleString();
|
||||||
|
|
||||||
|
llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ :
|
||||||
|
llvm::Reloc::Default;
|
||||||
|
#if defined(LLVM_3_0svn) || defined(LLVM_3_0)
|
||||||
|
std::string featuresString = attributes;
|
||||||
|
llvm::TargetMachine *targetMachine =
|
||||||
|
target->createTargetMachine(triple, cpu, featuresString, relocModel);
|
||||||
|
#else
|
||||||
|
#ifdef ISPC_IS_APPLE
|
||||||
|
relocModel = llvm::Reloc::PIC_;
|
||||||
|
#endif // ISPC_IS_APPLE
|
||||||
|
std::string featuresString = cpu + std::string(",") + attributes;
|
||||||
|
llvm::TargetMachine *targetMachine =
|
||||||
|
target->createTargetMachine(triple, featuresString);
|
||||||
|
#ifndef ISPC_IS_WINDOWS
|
||||||
|
targetMachine->setRelocationModel(relocModel);
|
||||||
|
#endif // !ISPC_IS_WINDOWS
|
||||||
|
#endif
|
||||||
|
assert(targetMachine != NULL);
|
||||||
|
|
||||||
|
targetMachine->setAsmVerbosityDefault(true);
|
||||||
|
return targetMachine;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Opt
|
// Opt
|
||||||
|
|
||||||
Opt::Opt() {
|
Opt::Opt() {
|
||||||
level = 1;
|
level = 1;
|
||||||
fastMath = false;
|
fastMath = false;
|
||||||
|
fastMaskedVload = false;
|
||||||
|
unrollLoops = true;
|
||||||
disableBlendedMaskedStores = false;
|
disableBlendedMaskedStores = false;
|
||||||
disableCoherentControlFlow = false;
|
disableCoherentControlFlow = false;
|
||||||
disableUniformControlFlow = false;
|
disableUniformControlFlow = false;
|
||||||
@@ -121,13 +309,9 @@ SourcePos::SourcePos(const char *n, int l, int c) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
llvm::DIFile SourcePos::GetDIFile() const {
|
llvm::DIFile SourcePos::GetDIFile() const {
|
||||||
#ifdef LLVM_2_8
|
|
||||||
return llvm::DIFile();
|
|
||||||
#else
|
|
||||||
std::string directory, filename;
|
std::string directory, filename;
|
||||||
GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
|
GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
|
||||||
return m->diBuilder->createFile(filename, directory);
|
return m->diBuilder->createFile(filename, directory);
|
||||||
#endif // LLVM_2_8
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
73
ispc.h
73
ispc.h
@@ -69,6 +69,8 @@ namespace llvm {
|
|||||||
class FunctionType;
|
class FunctionType;
|
||||||
class LLVMContext;
|
class LLVMContext;
|
||||||
class Module;
|
class Module;
|
||||||
|
class Target;
|
||||||
|
class TargetMachine;
|
||||||
class Type;
|
class Type;
|
||||||
class Value;
|
class Value;
|
||||||
}
|
}
|
||||||
@@ -146,6 +148,8 @@ public:
|
|||||||
pointer in place of the original ASTNode *. */
|
pointer in place of the original ASTNode *. */
|
||||||
virtual ASTNode *TypeCheck() = 0;
|
virtual ASTNode *TypeCheck() = 0;
|
||||||
|
|
||||||
|
virtual int EstimateCost() const = 0;
|
||||||
|
|
||||||
/** All AST nodes must track the file position where they are
|
/** All AST nodes must track the file position where they are
|
||||||
defined. */
|
defined. */
|
||||||
const SourcePos pos;
|
const SourcePos pos;
|
||||||
@@ -156,7 +160,34 @@ public:
|
|||||||
This structure defines a compilation target for the ispc compiler.
|
This structure defines a compilation target for the ispc compiler.
|
||||||
*/
|
*/
|
||||||
struct Target {
|
struct Target {
|
||||||
Target();
|
/** Initializes the given Target pointer for a target of the given
|
||||||
|
name, if the name is a known target. Returns true if the
|
||||||
|
target was initialized and false if the name is unknown. */
|
||||||
|
static bool GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||||
|
bool pic, Target *);
|
||||||
|
|
||||||
|
/** Returns a comma-delimited string giving the names of the currently
|
||||||
|
supported target ISAs. */
|
||||||
|
static const char *SupportedTargetISAs();
|
||||||
|
|
||||||
|
/** Returns a comma-delimited string giving the names of the currently
|
||||||
|
supported target CPUs. */
|
||||||
|
static const char *SupportedTargetCPUs();
|
||||||
|
|
||||||
|
/** Returns a comma-delimited string giving the names of the currently
|
||||||
|
supported target architectures. */
|
||||||
|
static const char *SupportedTargetArchs();
|
||||||
|
|
||||||
|
/** Returns a triple string specifying the target architecture, vendor,
|
||||||
|
and environment. */
|
||||||
|
std::string GetTripleString() const;
|
||||||
|
|
||||||
|
/** Returns the LLVM TargetMachine object corresponding to this
|
||||||
|
target. */
|
||||||
|
llvm::TargetMachine *GetTargetMachine() const;
|
||||||
|
|
||||||
|
/** llvm Target object representing this target. */
|
||||||
|
const llvm::Target *target;
|
||||||
|
|
||||||
/** Enumerator giving the instruction sets that the compiler can
|
/** Enumerator giving the instruction sets that the compiler can
|
||||||
target. */
|
target. */
|
||||||
@@ -174,6 +205,9 @@ struct Target {
|
|||||||
/** Target CPU. (e.g. "corei7", "corei7-avx", ..) */
|
/** Target CPU. (e.g. "corei7", "corei7-avx", ..) */
|
||||||
std::string cpu;
|
std::string cpu;
|
||||||
|
|
||||||
|
/** Target-specific attributes to pass along to the LLVM backend */
|
||||||
|
std::string attributes;
|
||||||
|
|
||||||
/** Native vector width of the vector instruction set. Note that this
|
/** Native vector width of the vector instruction set. Note that this
|
||||||
value is directly derived from the ISA being used (e.g. it's 4 for
|
value is directly derived from the ISA being used (e.g. it's 4 for
|
||||||
SSE, 8 for AVX, etc.) */
|
SSE, 8 for AVX, etc.) */
|
||||||
@@ -183,8 +217,12 @@ struct Target {
|
|||||||
integer multiple of the native vector width, for example if we're
|
integer multiple of the native vector width, for example if we're
|
||||||
"doubling up" and compiling 8-wide on a 4-wide SSE system. */
|
"doubling up" and compiling 8-wide on a 4-wide SSE system. */
|
||||||
int vectorWidth;
|
int vectorWidth;
|
||||||
|
|
||||||
|
/** Indicates whether position independent code should be generated. */
|
||||||
|
bool generatePIC;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/** @brief Structure that collects optimization options
|
/** @brief Structure that collects optimization options
|
||||||
|
|
||||||
This structure collects all of the options related to optimization of
|
This structure collects all of the options related to optimization of
|
||||||
@@ -202,6 +240,16 @@ struct Opt {
|
|||||||
should be performed. This is false by default. */
|
should be performed. This is false by default. */
|
||||||
bool fastMath;
|
bool fastMath;
|
||||||
|
|
||||||
|
/** Indicates whether a vector load should be issued for masked loads
|
||||||
|
on platforms that don't have a native masked vector load. (This may
|
||||||
|
lead to accessing memory up to programCount-1 elements past the end of
|
||||||
|
arrays, so is unsafe in general.) */
|
||||||
|
bool fastMaskedVload;
|
||||||
|
|
||||||
|
/** Indicates when loops should be unrolled (when doing so seems like
|
||||||
|
it will make sense). */
|
||||||
|
bool unrollLoops;
|
||||||
|
|
||||||
/** On targets that don't have a masked store instruction but do have a
|
/** On targets that don't have a masked store instruction but do have a
|
||||||
blending instruction, by default, we simulate masked stores by
|
blending instruction, by default, we simulate masked stores by
|
||||||
loading the old value, blending, and storing the result. This can
|
loading the old value, blending, and storing the result. This can
|
||||||
@@ -319,6 +367,29 @@ struct Globals {
|
|||||||
std::vector<std::string> cppArgs;
|
std::vector<std::string> cppArgs;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum {
|
||||||
|
COST_ASSIGN = 1,
|
||||||
|
COST_COHERENT_BREAK_CONTINE = 4,
|
||||||
|
COST_COMPLEX_ARITH_OP = 4,
|
||||||
|
COST_DEREF = 4,
|
||||||
|
COST_FUNCALL = 4,
|
||||||
|
COST_GATHER = 8,
|
||||||
|
COST_LOAD = 2,
|
||||||
|
COST_REGULAR_BREAK_CONTINUE = 2,
|
||||||
|
COST_RETURN = 4,
|
||||||
|
COST_SELECT = 4,
|
||||||
|
COST_SIMPLE_ARITH_LOGIC_OP = 1,
|
||||||
|
COST_SYNC = 32,
|
||||||
|
COST_TASK_LAUNCH = 16,
|
||||||
|
COST_TYPECAST_COMPLEX = 4,
|
||||||
|
COST_TYPECAST_SIMPLE = 1,
|
||||||
|
COST_UNIFORM_LOOP = 4,
|
||||||
|
COST_VARYING_LOOP = 6,
|
||||||
|
|
||||||
|
CHECK_MASK_AT_FUNCTION_START_COST = 16,
|
||||||
|
PREDICATE_SAFE_IF_STATEMENT_COST = 6,
|
||||||
|
};
|
||||||
|
|
||||||
extern Globals *g;
|
extern Globals *g;
|
||||||
extern Module *m;
|
extern Module *m;
|
||||||
|
|
||||||
|
|||||||
38
ispc.vcxproj
38
ispc.vcxproj
@@ -1,4 +1,4 @@
|
|||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
<ItemGroup Label="ProjectConfigurations">
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
<ProjectConfiguration Include="Debug|Win32">
|
<ProjectConfiguration Include="Debug|Win32">
|
||||||
@@ -16,6 +16,7 @@
|
|||||||
<ClCompile Include="decl.cpp" />
|
<ClCompile Include="decl.cpp" />
|
||||||
<ClCompile Include="expr.cpp" />
|
<ClCompile Include="expr.cpp" />
|
||||||
<ClCompile Include="gen-bitcode-avx.cpp" />
|
<ClCompile Include="gen-bitcode-avx.cpp" />
|
||||||
|
<ClCompile Include="gen-bitcode-avx-x2.cpp" />
|
||||||
<ClCompile Include="gen-bitcode-c-32.cpp" />
|
<ClCompile Include="gen-bitcode-c-32.cpp" />
|
||||||
<ClCompile Include="gen-bitcode-c-64.cpp" />
|
<ClCompile Include="gen-bitcode-c-64.cpp" />
|
||||||
<ClCompile Include="gen-bitcode-sse2.cpp" />
|
<ClCompile Include="gen-bitcode-sse2.cpp" />
|
||||||
@@ -30,12 +31,14 @@
|
|||||||
<ClCompile Include="opt.cpp" />
|
<ClCompile Include="opt.cpp" />
|
||||||
<ClCompile Include="parse.cc" />
|
<ClCompile Include="parse.cc" />
|
||||||
<CustomBuild Include="builtins-c.c">
|
<CustomBuild Include="builtins-c.c">
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp</Command>
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp;
|
||||||
|
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp</Command>
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang builtins-c.c</Message>
|
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang builtins-c.c</Message>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp</Command>
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp;
|
||||||
|
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp</Command>
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang builtins-c.c</Message>
|
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang builtins-c.c</Message>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c.cpp</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c.cpp</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp</Outputs>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
<ClCompile Include="stmt.cpp" />
|
<ClCompile Include="stmt.cpp" />
|
||||||
<ClCompile Include="sym.cpp" />
|
<ClCompile Include="sym.cpp" />
|
||||||
@@ -60,9 +63,9 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="stdlib.ispc">
|
<CustomBuild Include="stdlib.ispc">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib.cpp</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib.cpp</Outputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib.cpp</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib.cpp</Outputs>
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib.cpp</Message>
|
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib.cpp</Message>
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib.cpp</Message>
|
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib.cpp</Message>
|
||||||
@@ -120,6 +123,19 @@
|
|||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
|
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CustomBuild Include="builtins-avx-x2.ll">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp</Command>
|
||||||
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
|
||||||
|
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||||
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp</Command>
|
||||||
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
|
||||||
|
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||||
|
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
|
||||||
|
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
|
||||||
|
</CustomBuild>
|
||||||
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="lex.ll">
|
<CustomBuild Include="lex.ll">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
@@ -180,7 +196,7 @@
|
|||||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
|
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||||
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
|
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -188,7 +204,7 @@
|
|||||||
<SubSystem>Console</SubSystem>
|
<SubSystem>Console</SubSystem>
|
||||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
</Link>
|
</Link>
|
||||||
</ItemDefinitionGroup>
|
</ItemDefinitionGroup>
|
||||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
@@ -198,7 +214,7 @@
|
|||||||
<Optimization>MaxSpeed</Optimization>
|
<Optimization>MaxSpeed</Optimization>
|
||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
|
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||||
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
|
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -208,7 +224,7 @@
|
|||||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||||
<OptimizeReferences>true</OptimizeReferences>
|
<OptimizeReferences>true</OptimizeReferences>
|
||||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
</Link>
|
</Link>
|
||||||
</ItemDefinitionGroup>
|
</ItemDefinitionGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
|||||||
129
ispc_test.cpp
129
ispc_test.cpp
@@ -33,12 +33,25 @@
|
|||||||
|
|
||||||
#define _CRT_SECURE_NO_WARNINGS
|
#define _CRT_SECURE_NO_WARNINGS
|
||||||
|
|
||||||
|
#if defined(_WIN32) || defined(_WIN64)
|
||||||
|
#define ISPC_IS_WINDOWS
|
||||||
|
#elif defined(__linux__)
|
||||||
|
#define ISPC_IS_LINUX
|
||||||
|
#elif defined(__APPLE__)
|
||||||
|
#define ISPC_IS_APPLE
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef ISPC_IS_WINDOWS
|
#ifdef ISPC_IS_WINDOWS
|
||||||
#define NOMINMAX
|
#define NOMINMAX
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#endif
|
#endif
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <memory.h>
|
||||||
|
#ifdef ISPC_IS_LINUX
|
||||||
|
#include <malloc.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef ISPC_HAVE_SVML
|
#ifdef ISPC_HAVE_SVML
|
||||||
#include <xmmintrin.h>
|
#include <xmmintrin.h>
|
||||||
@@ -61,8 +74,14 @@ extern "C" {
|
|||||||
#include <llvm/DerivedTypes.h>
|
#include <llvm/DerivedTypes.h>
|
||||||
#include <llvm/Instructions.h>
|
#include <llvm/Instructions.h>
|
||||||
#include <llvm/ExecutionEngine/ExecutionEngine.h>
|
#include <llvm/ExecutionEngine/ExecutionEngine.h>
|
||||||
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
|
#include <llvm/Support/TargetRegistry.h>
|
||||||
|
#include <llvm/Support/TargetSelect.h>
|
||||||
|
#else
|
||||||
|
#include <llvm/Target/TargetRegistry.h>
|
||||||
|
#include <llvm/Target/TargetSelect.h>
|
||||||
|
#endif
|
||||||
#include <llvm/ExecutionEngine/JIT.h>
|
#include <llvm/ExecutionEngine/JIT.h>
|
||||||
#include <llvm/Target/TargetSelect.h>
|
|
||||||
#include <llvm/Target/TargetOptions.h>
|
#include <llvm/Target/TargetOptions.h>
|
||||||
#include <llvm/Target/TargetData.h>
|
#include <llvm/Target/TargetData.h>
|
||||||
#include <llvm/Transforms/Scalar.h>
|
#include <llvm/Transforms/Scalar.h>
|
||||||
@@ -74,42 +93,53 @@ extern "C" {
|
|||||||
#include <llvm/Support/raw_ostream.h>
|
#include <llvm/Support/raw_ostream.h>
|
||||||
#include <llvm/Bitcode/ReaderWriter.h>
|
#include <llvm/Bitcode/ReaderWriter.h>
|
||||||
#include <llvm/Support/MemoryBuffer.h>
|
#include <llvm/Support/MemoryBuffer.h>
|
||||||
#ifndef LLVM_2_8
|
|
||||||
#include <llvm/Support/system_error.h>
|
#include <llvm/Support/system_error.h>
|
||||||
#endif
|
|
||||||
|
bool shouldFail = false;
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
void ISPCLaunch(void *, void *);
|
void ISPCLaunch(void **, void *, void *, int32_t);
|
||||||
void ISPCSync();
|
void ISPCSync(void *);
|
||||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
void *ISPCAlloc(void **, int64_t size, int32_t alignment);
|
||||||
void ISPCFree(void *ptr);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ISPCLaunch(void *func, void *data) {
|
void ISPCLaunch(void **handle, void *func, void *data, int32_t count) {
|
||||||
typedef void (*TaskFuncType)(void *, int, int);
|
*handle = (void *)0xdeadbeef;
|
||||||
|
typedef void (*TaskFuncType)(void *, int, int, int, int);
|
||||||
TaskFuncType tft = (TaskFuncType)(func);
|
TaskFuncType tft = (TaskFuncType)(func);
|
||||||
tft(data, 0, 1);
|
for (int i = 0; i < count; ++i)
|
||||||
|
tft(data, 0, 1, i, count);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void ISPCSync() {
|
void ISPCSync(void *) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
|
||||||
|
*handle = (void *)0xdeadbeef;
|
||||||
|
// leak time!
|
||||||
#ifdef ISPC_IS_WINDOWS
|
#ifdef ISPC_IS_WINDOWS
|
||||||
void *ISPCMalloc(int64_t size, int32_t alignment) {
|
|
||||||
return _aligned_malloc(size, alignment);
|
return _aligned_malloc(size, alignment);
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void ISPCFree(void *ptr) {
|
|
||||||
_aligned_free(ptr);
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef ISPC_IS_LINUX
|
||||||
|
return memalign(alignment, size);
|
||||||
|
#endif
|
||||||
|
#ifdef ISPC_IS_APPLE
|
||||||
|
void *mem = malloc(size + (alignment-1) + sizeof(void*));
|
||||||
|
char *amem = ((char*)mem) + sizeof(void*);
|
||||||
|
amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
|
||||||
|
(alignment - 1)));
|
||||||
|
((void**)amem)[-1] = mem;
|
||||||
|
return amem;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static void usage(int ret) {
|
static void usage(int ret) {
|
||||||
fprintf(stderr, "usage: ispc_test\n");
|
fprintf(stderr, "usage: ispc_test\n");
|
||||||
fprintf(stderr, "\t[-h/--help]\tprint help\n");
|
fprintf(stderr, "\t[-h/--help]\tprint help\n");
|
||||||
|
fprintf(stderr, "\t[-f]\t\tindicates that test is expected to fail\n");
|
||||||
fprintf(stderr, "\t<files>\n");
|
fprintf(stderr, "\t<files>\n");
|
||||||
exit(ret);
|
exit(ret);
|
||||||
}
|
}
|
||||||
@@ -135,17 +165,6 @@ double Log(double x) { return log(x); }
|
|||||||
static bool lRunTest(const char *fn) {
|
static bool lRunTest(const char *fn) {
|
||||||
llvm::LLVMContext *ctx = new llvm::LLVMContext;
|
llvm::LLVMContext *ctx = new llvm::LLVMContext;
|
||||||
|
|
||||||
#ifdef LLVM_2_8
|
|
||||||
std::string err;
|
|
||||||
llvm::MemoryBuffer *buf = llvm::MemoryBuffer::getFileOrSTDIN(fn, &err);
|
|
||||||
if (!buf) {
|
|
||||||
fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.c_str());
|
|
||||||
delete ctx;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
std::string bcErr;
|
|
||||||
llvm::Module *module = llvm::ParseBitcodeFile(buf, *ctx, &bcErr);
|
|
||||||
#else
|
|
||||||
llvm::OwningPtr<llvm::MemoryBuffer> buf;
|
llvm::OwningPtr<llvm::MemoryBuffer> buf;
|
||||||
llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf);
|
llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf);
|
||||||
if (err) {
|
if (err) {
|
||||||
@@ -155,7 +174,6 @@ static bool lRunTest(const char *fn) {
|
|||||||
}
|
}
|
||||||
std::string bcErr;
|
std::string bcErr;
|
||||||
llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr);
|
llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr);
|
||||||
#endif
|
|
||||||
|
|
||||||
if (!module) {
|
if (!module) {
|
||||||
fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str());
|
fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str());
|
||||||
@@ -164,7 +182,21 @@ static bool lRunTest(const char *fn) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::string eeError;
|
std::string eeError;
|
||||||
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
|
llvm::EngineBuilder engineBuilder(module);
|
||||||
|
engineBuilder.setErrorStr(&eeError);
|
||||||
|
engineBuilder.setEngineKind(llvm::EngineKind::JIT);
|
||||||
|
#if 0
|
||||||
|
std::vector<std::string> attributes;
|
||||||
|
if (target != NULL && !strcmp(target, "avx"))
|
||||||
|
attributes.push_back("+avx");
|
||||||
|
engineBuilder.setMAttrs(attributes);
|
||||||
|
engineBuilder.setUseMCJIT(true);
|
||||||
|
#endif
|
||||||
|
llvm::ExecutionEngine *ee = engineBuilder.create();
|
||||||
|
#else
|
||||||
llvm::ExecutionEngine *ee = llvm::ExecutionEngine::createJIT(module, &eeError);
|
llvm::ExecutionEngine *ee = llvm::ExecutionEngine::createJIT(module, &eeError);
|
||||||
|
#endif
|
||||||
if (!ee) {
|
if (!ee) {
|
||||||
fprintf(stderr, "Unable to create ExecutionEngine: %s\n", eeError.c_str());
|
fprintf(stderr, "Unable to create ExecutionEngine: %s\n", eeError.c_str());
|
||||||
return false;
|
return false;
|
||||||
@@ -176,10 +208,7 @@ static bool lRunTest(const char *fn) {
|
|||||||
ee->addGlobalMapping(func, (void *)FUNC)
|
ee->addGlobalMapping(func, (void *)FUNC)
|
||||||
DO_FUNC(ISPCLaunch, "ISPCLaunch");
|
DO_FUNC(ISPCLaunch, "ISPCLaunch");
|
||||||
DO_FUNC(ISPCSync, "ISPCSync");
|
DO_FUNC(ISPCSync, "ISPCSync");
|
||||||
#ifdef ISPC_IS_WINDOWS
|
DO_FUNC(ISPCAlloc, "ISPCAlloc");
|
||||||
DO_FUNC(ISPCMalloc, "ISPCMalloc");
|
|
||||||
DO_FUNC(ISPCFree, "ISPCFree");
|
|
||||||
#endif // ISPC_IS_WINDOWS
|
|
||||||
DO_FUNC(putchar, "putchar");
|
DO_FUNC(putchar, "putchar");
|
||||||
DO_FUNC(printf, "printf");
|
DO_FUNC(printf, "printf");
|
||||||
DO_FUNC(fflush, "fflush");
|
DO_FUNC(fflush, "fflush");
|
||||||
@@ -246,7 +275,6 @@ static bool lRunTest(const char *fn) {
|
|||||||
float result[16];
|
float result[16];
|
||||||
for (int i = 0; i < 16; ++i)
|
for (int i = 0; i < 16; ++i)
|
||||||
result[i] = 0;
|
result[i] = 0;
|
||||||
bool ok = true;
|
|
||||||
if (foundResult) {
|
if (foundResult) {
|
||||||
typedef void (*PFN)(float *);
|
typedef void (*PFN)(float *);
|
||||||
PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
|
PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
|
||||||
@@ -303,15 +331,15 @@ static bool lRunTest(const char *fn) {
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
fprintf(stderr, "Unable to find runnable function in file \"%s\"\n", fn);
|
fprintf(stderr, "Unable to find runnable function in file \"%s\"\n", fn);
|
||||||
ok = false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// see if we got the right result
|
// see if we got the right result
|
||||||
if (ok) {
|
bool resultsMatch = true;
|
||||||
if (foundResult) {
|
if (foundResult) {
|
||||||
for (int i = 0; i < width; ++i)
|
for (int i = 0; i < width; ++i)
|
||||||
if (returned[i] != result[i]) {
|
if (returned[i] != result[i]) {
|
||||||
ok = false;
|
resultsMatch = false;
|
||||||
fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
|
fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
|
||||||
fn, i, returned[i], returned[i], result[i], result[i]);
|
fn, i, returned[i], returned[i], result[i], result[i]);
|
||||||
}
|
}
|
||||||
@@ -321,32 +349,31 @@ static bool lRunTest(const char *fn) {
|
|||||||
fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
|
fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
|
||||||
fn, i, returned[i], returned[i]);
|
fn, i, returned[i], returned[i]);
|
||||||
}
|
}
|
||||||
}
|
if (foundResult && shouldFail && resultsMatch)
|
||||||
|
fprintf(stderr, "Test %s unexpectedly passed\n", fn);
|
||||||
|
|
||||||
delete ee;
|
delete ee;
|
||||||
delete ctx;
|
delete ctx;
|
||||||
|
|
||||||
return ok && foundResult;
|
return foundResult && resultsMatch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
llvm::InitializeNativeTarget();
|
llvm::InitializeNativeTarget();
|
||||||
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
|
LLVMLinkInJIT();
|
||||||
|
#endif
|
||||||
|
|
||||||
std::vector<const char *> files;
|
const char *filename = NULL;
|
||||||
for (int i = 1; i < argc; ++i) {
|
for (int i = 1; i < argc; ++i) {
|
||||||
if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
|
if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
|
||||||
usage(0);
|
usage(0);
|
||||||
|
if (!strcmp(argv[i], "-f"))
|
||||||
|
shouldFail = true;
|
||||||
else
|
else
|
||||||
files.push_back(argv[i]);
|
filename = argv[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
int passes = 0, fails = 0;
|
return (lRunTest(filename) == true) ? 0 : 1;
|
||||||
for (unsigned int i = 0; i < files.size(); ++i) {
|
|
||||||
if (lRunTest(files[i])) ++passes;
|
|
||||||
else ++fails;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (fails > 0)
|
|
||||||
fprintf(stderr, "%d/%d tests passed\n", passes, passes+fails);
|
|
||||||
return fails > 0;
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -52,14 +52,14 @@
|
|||||||
</PrecompiledHeader>
|
</PrecompiledHeader>
|
||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>ISPC_IS_WINDOWS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>LLVM_3_0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
<SubSystem>Console</SubSystem>
|
<SubSystem>Console</SubSystem>
|
||||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
|
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
|
||||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
</Link>
|
</Link>
|
||||||
</ItemDefinitionGroup>
|
</ItemDefinitionGroup>
|
||||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
@@ -70,7 +70,7 @@
|
|||||||
<Optimization>MaxSpeed</Optimization>
|
<Optimization>MaxSpeed</Optimization>
|
||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>ISPC_IS_WINDOWS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>LLVM_3_0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -79,7 +79,7 @@
|
|||||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||||
<OptimizeReferences>true</OptimizeReferences>
|
<OptimizeReferences>true</OptimizeReferences>
|
||||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
|
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
|
||||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
</Link>
|
</Link>
|
||||||
</ItemDefinitionGroup>
|
</ItemDefinitionGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
|||||||
160
main.cpp
160
main.cpp
@@ -40,10 +40,14 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <llvm/Support/PrettyStackTrace.h>
|
#include <llvm/Support/PrettyStackTrace.h>
|
||||||
#ifdef LLVM_2_8
|
|
||||||
#include <llvm/System/Signals.h>
|
|
||||||
#else
|
|
||||||
#include <llvm/Support/Signals.h>
|
#include <llvm/Support/Signals.h>
|
||||||
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
|
#include <llvm/Support/TargetRegistry.h>
|
||||||
|
#include <llvm/Support/TargetSelect.h>
|
||||||
|
#else
|
||||||
|
#include <llvm/Target/TargetRegistry.h>
|
||||||
|
#include <llvm/Target/TargetSelect.h>
|
||||||
|
#include <llvm/Target/SubtargetFeature.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef ISPC_IS_WINDOWS
|
#ifdef ISPC_IS_WINDOWS
|
||||||
@@ -53,36 +57,36 @@
|
|||||||
#endif // ISPC_IS_WINDOWS
|
#endif // ISPC_IS_WINDOWS
|
||||||
|
|
||||||
static void usage(int ret) {
|
static void usage(int ret) {
|
||||||
printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n", BUILD_DATE, BUILD_VERSION);
|
printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n",
|
||||||
|
BUILD_DATE, BUILD_VERSION);
|
||||||
printf("usage: ispc\n");
|
printf("usage: ispc\n");
|
||||||
printf(" [--arch={x86,x86-64}]\t\tSelect target architecture\n");
|
printf(" [--arch={%s}]\t\tSelect target architecture\n",
|
||||||
|
Target::SupportedTargetArchs());
|
||||||
printf(" [--cpu=<cpu>]\t\t\tSelect target CPU type\n");
|
printf(" [--cpu=<cpu>]\t\t\tSelect target CPU type\n");
|
||||||
printf(" (atom, barcelona, core2, corei7, corei7-avx, istanbul, nocona,\n");
|
printf(" <cpu>={%s}\n", Target::SupportedTargetCPUs());
|
||||||
printf(" penryn, westmere)\n");
|
printf(" [-D<foo>]\t\t\t\t#define given value when running preprocessor\n");
|
||||||
#ifndef ISPC_IS_WINDOWS
|
|
||||||
printf(" [-D<foo>]\t\t\t\t#define value when running preprocessor\n");
|
|
||||||
#endif
|
|
||||||
printf(" [--debug]\t\t\t\tPrint information useful for debugging ispc\n");
|
printf(" [--debug]\t\t\t\tPrint information useful for debugging ispc\n");
|
||||||
printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n");
|
printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n");
|
||||||
printf(" [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
|
printf(" [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
|
||||||
printf(" [--emit-obj]\t\t\tGenerate object file file as output\n");
|
printf(" [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
|
||||||
printf(" [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
|
|
||||||
printf(" [-g]\t\t\t\tGenerate debugging information\n");
|
printf(" [-g]\t\t\t\tGenerate debugging information\n");
|
||||||
printf(" [--help]\t\t\t\tPrint help\n");
|
printf(" [--help]\t\t\t\tPrint help\n");
|
||||||
printf(" [-h] <name>\t\t\t\tOutput filename for header\n");
|
printf(" [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
|
||||||
printf(" [--instrument]\t\t\tEmit instrumentation to gather performance data\n");
|
printf(" [--instrument]\t\t\tEmit instrumentation to gather performance data\n");
|
||||||
printf(" [--math-lib=<option>]\t\tSelect math library\n");
|
printf(" [--math-lib=<option>]\t\tSelect math library\n");
|
||||||
printf(" default\t\t\t\tUse ispc's built-in math functions\n");
|
printf(" default\t\t\t\tUse ispc's built-in math functions\n");
|
||||||
printf(" fast\t\t\t\tUse high-performance but lower-accuracy math functions\n");
|
printf(" fast\t\t\t\tUse high-performance but lower-accuracy math functions\n");
|
||||||
printf(" svml\t\t\t\tUse the Intel SVML math libraries\n");
|
printf(" svml\t\t\t\tUse the Intel(r) SVML math libraries\n");
|
||||||
printf(" system\t\t\t\tUse the system's math library (*may be quite slow*)\n");
|
printf(" system\t\t\t\tUse the system's math library (*may be quite slow*)\n");
|
||||||
printf(" [--nostdlib]\t\t\tDon't make the ispc standard library available\n");
|
printf(" [--nostdlib]\t\t\tDon't make the ispc standard library available\n");
|
||||||
#ifndef ISPC_IS_WINDOWS
|
|
||||||
printf(" [--nocpp]\t\t\t\tDon't run the C preprocessor\n");
|
printf(" [--nocpp]\t\t\t\tDon't run the C preprocessor\n");
|
||||||
#endif
|
printf(" [-o <name>/--outfile=<name>]\tOutput filename (may be \"-\" for standard output)\n");
|
||||||
printf(" [-o/--outfile] <name>\t\tOutput filename for bitcode (may be \"-\" for standard output)\n");
|
printf(" [-O0/-O1]\t\t\t\tSet optimization level (-O1 is default)\n");
|
||||||
printf(" [-O0/-O1]\t\t\t\tSet optimization level\n");
|
|
||||||
printf(" [--opt=<option>]\t\t\tSet optimization option\n");
|
printf(" [--opt=<option>]\t\t\tSet optimization option\n");
|
||||||
|
printf(" disable-loop-unroll\t\tDisable loop unrolling.\n");
|
||||||
|
printf(" fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
|
||||||
|
printf(" fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
|
||||||
|
#if 0
|
||||||
printf(" disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
|
printf(" disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
|
||||||
printf(" disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
|
printf(" disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
|
||||||
printf(" disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
|
printf(" disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
|
||||||
@@ -91,11 +95,11 @@ static void usage(int ret) {
|
|||||||
printf(" disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
|
printf(" disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
|
||||||
printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
|
printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
|
||||||
printf(" disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
|
printf(" disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
|
||||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
#endif
|
||||||
printf(" [--target={sse2,sse4,sse4x2,avx}] Select target ISA (SSE4 is default unless compiling for atom; then SSE2 is.)\n");
|
#ifndef ISPC_IS_WINDOWS
|
||||||
#else
|
printf(" [--pic]\t\t\t\tGenerate position-independent code\n");
|
||||||
printf(" [--target={sse2,sse4,sse4x2}] Select target ISA (SSE4 is default unless compiling for atom; then SSE2 is.)\n");
|
#endif // !ISPC_IS_WINDOWS
|
||||||
#endif // LLVM 3.0
|
printf(" [--target=<isa>]\t\t\tSelect target ISA. <isa>={%s}\n", Target::SupportedTargetISAs());
|
||||||
printf(" [--version]\t\t\t\tPrint ispc version\n");
|
printf(" [--version]\t\t\t\tPrint ispc version\n");
|
||||||
printf(" [--woff]\t\t\t\tDisable warnings\n");
|
printf(" [--woff]\t\t\t\tDisable warnings\n");
|
||||||
printf(" [--wno-perf]\t\t\tDon't issue warnings related to performance-related issues\n");
|
printf(" [--wno-perf]\t\t\tDon't issue warnings related to performance-related issues\n");
|
||||||
@@ -103,35 +107,6 @@ static void usage(int ret) {
|
|||||||
exit(ret);
|
exit(ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Given a target name string, set initialize the global g->target
|
|
||||||
structure appropriately.
|
|
||||||
*/
|
|
||||||
static void lDoTarget(const char *target) {
|
|
||||||
if (!strcasecmp(target, "sse2")) {
|
|
||||||
g->target.isa = Target::SSE2;
|
|
||||||
g->target.nativeVectorWidth = 4;
|
|
||||||
g->target.vectorWidth = 4;
|
|
||||||
}
|
|
||||||
else if (!strcasecmp(target, "sse4")) {
|
|
||||||
g->target.isa = Target::SSE4;
|
|
||||||
g->target.nativeVectorWidth = 4;
|
|
||||||
g->target.vectorWidth = 4;
|
|
||||||
}
|
|
||||||
else if (!strcasecmp(target, "sse4x2")) {
|
|
||||||
g->target.isa = Target::SSE4;
|
|
||||||
g->target.nativeVectorWidth = 4;
|
|
||||||
g->target.vectorWidth = 8;
|
|
||||||
}
|
|
||||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
|
||||||
else if (!strcasecmp(target, "avx")) {
|
|
||||||
g->target.isa = Target::AVX;
|
|
||||||
g->target.nativeVectorWidth = 8;
|
|
||||||
g->target.vectorWidth = 8;
|
|
||||||
}
|
|
||||||
#endif // LLVM 3.0
|
|
||||||
else
|
|
||||||
usage(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/** We take arguments from both the command line as well as from the
|
/** We take arguments from both the command line as well as from the
|
||||||
@@ -190,6 +165,16 @@ int main(int Argc, char *Argv[]) {
|
|||||||
llvm::sys::PrintStackTraceOnErrorSignal();
|
llvm::sys::PrintStackTraceOnErrorSignal();
|
||||||
llvm::PrettyStackTraceProgram X(argc, argv);
|
llvm::PrettyStackTraceProgram X(argc, argv);
|
||||||
|
|
||||||
|
// initialize available LLVM targets
|
||||||
|
LLVMInitializeX86TargetInfo();
|
||||||
|
LLVMInitializeX86Target();
|
||||||
|
LLVMInitializeX86AsmPrinter();
|
||||||
|
LLVMInitializeX86AsmParser();
|
||||||
|
LLVMInitializeX86Disassembler();
|
||||||
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
|
LLVMInitializeX86TargetMC();
|
||||||
|
#endif
|
||||||
|
|
||||||
char *file = NULL;
|
char *file = NULL;
|
||||||
const char *headerFileName = NULL;
|
const char *headerFileName = NULL;
|
||||||
const char *outFileName = NULL;
|
const char *outFileName = NULL;
|
||||||
@@ -198,28 +183,29 @@ int main(int Argc, char *Argv[]) {
|
|||||||
// as we're parsing below
|
// as we're parsing below
|
||||||
g = new Globals;
|
g = new Globals;
|
||||||
|
|
||||||
bool debugSet = false, optSet = false, targetSet = false;
|
bool debugSet = false, optSet = false;
|
||||||
Module::OutputType ot = Module::Object;
|
Module::OutputType ot = Module::Object;
|
||||||
|
bool generatePIC = false;
|
||||||
|
const char *arch = NULL, *cpu = NULL, *target = NULL;
|
||||||
|
|
||||||
for (int i = 1; i < argc; ++i) {
|
for (int i = 1; i < argc; ++i) {
|
||||||
if (!strcmp(argv[i], "--help"))
|
if (!strcmp(argv[i], "--help"))
|
||||||
usage(0);
|
usage(0);
|
||||||
#ifndef ISPC_IS_WINDOWS
|
else if (!strncmp(argv[i], "-D", 2))
|
||||||
else if (!strncmp(argv[i], "-D", 2)) {
|
|
||||||
g->cppArgs.push_back(argv[i]);
|
g->cppArgs.push_back(argv[i]);
|
||||||
}
|
else if (!strncmp(argv[i], "--arch=", 7))
|
||||||
#endif // !ISPC_IS_WINDOWS
|
arch = argv[i] + 7;
|
||||||
else if (!strncmp(argv[i], "--arch=", 7)) {
|
|
||||||
g->target.arch = argv[i] + 7;
|
|
||||||
if (g->target.arch == "x86")
|
|
||||||
g->target.is32bit = true;
|
|
||||||
else if (g->target.arch == "x86-64")
|
|
||||||
g->target.is32bit = false;
|
|
||||||
}
|
|
||||||
else if (!strncmp(argv[i], "--cpu=", 6))
|
else if (!strncmp(argv[i], "--cpu=", 6))
|
||||||
g->target.cpu = argv[i] + 6;
|
cpu = argv[i] + 6;
|
||||||
else if (!strcmp(argv[i], "--fast-math"))
|
else if (!strcmp(argv[i], "--fast-math")) {
|
||||||
g->opt.fastMath = true;
|
fprintf(stderr, "--fast-math option has been renamed to --opt=fast-math!\n");
|
||||||
|
usage(1);
|
||||||
|
}
|
||||||
|
else if (!strcmp(argv[i], "--fast-masked-vload")) {
|
||||||
|
fprintf(stderr, "--fast-masked-vload option has been renamed to "
|
||||||
|
"--opt=fast-masked-vload!\n");
|
||||||
|
usage(1);
|
||||||
|
}
|
||||||
else if (!strcmp(argv[i], "--debug"))
|
else if (!strcmp(argv[i], "--debug"))
|
||||||
g->debugPrint = true;
|
g->debugPrint = true;
|
||||||
else if (!strcmp(argv[i], "--instrument"))
|
else if (!strcmp(argv[i], "--instrument"))
|
||||||
@@ -235,14 +221,12 @@ int main(int Argc, char *Argv[]) {
|
|||||||
else if (!strcmp(argv[i], "--emit-obj"))
|
else if (!strcmp(argv[i], "--emit-obj"))
|
||||||
ot = Module::Object;
|
ot = Module::Object;
|
||||||
else if (!strcmp(argv[i], "--target")) {
|
else if (!strcmp(argv[i], "--target")) {
|
||||||
|
// FIXME: should remove this way of specifying the target...
|
||||||
if (++i == argc) usage(1);
|
if (++i == argc) usage(1);
|
||||||
lDoTarget(argv[i]);
|
target = argv[i];
|
||||||
targetSet = true;
|
|
||||||
}
|
|
||||||
else if (!strncmp(argv[i], "--target=", 9)) {
|
|
||||||
const char *target = argv[i] + 9;
|
|
||||||
lDoTarget(target);
|
|
||||||
}
|
}
|
||||||
|
else if (!strncmp(argv[i], "--target=", 9))
|
||||||
|
target = argv[i] + 9;
|
||||||
else if (!strncmp(argv[i], "--math-lib=", 11)) {
|
else if (!strncmp(argv[i], "--math-lib=", 11)) {
|
||||||
const char *lib = argv[i] + 11;
|
const char *lib = argv[i] + 11;
|
||||||
if (!strcmp(lib, "default"))
|
if (!strcmp(lib, "default"))
|
||||||
@@ -258,7 +242,16 @@ int main(int Argc, char *Argv[]) {
|
|||||||
}
|
}
|
||||||
else if (!strncmp(argv[i], "--opt=", 6)) {
|
else if (!strncmp(argv[i], "--opt=", 6)) {
|
||||||
const char *opt = argv[i] + 6;
|
const char *opt = argv[i] + 6;
|
||||||
if (!strcmp(opt, "disable-blended-masked-stores"))
|
if (!strcmp(opt, "fast-math"))
|
||||||
|
g->opt.fastMath = true;
|
||||||
|
else if (!strcmp(opt, "fast-masked-vload"))
|
||||||
|
g->opt.fastMaskedVload = true;
|
||||||
|
else if (!strcmp(opt, "disable-loop-unroll"))
|
||||||
|
g->opt.unrollLoops = false;
|
||||||
|
|
||||||
|
// These are only used for performance tests of specific
|
||||||
|
// optimizations
|
||||||
|
else if (!strcmp(opt, "disable-blended-masked-stores"))
|
||||||
g->opt.disableBlendedMaskedStores = true;
|
g->opt.disableBlendedMaskedStores = true;
|
||||||
else if (!strcmp(opt, "disable-coherent-control-flow"))
|
else if (!strcmp(opt, "disable-coherent-control-flow"))
|
||||||
g->opt.disableCoherentControlFlow = true;
|
g->opt.disableCoherentControlFlow = true;
|
||||||
@@ -283,14 +276,19 @@ int main(int Argc, char *Argv[]) {
|
|||||||
}
|
}
|
||||||
else if (!strcmp(argv[i], "--wno-perf") || !strcmp(argv[i], "-wno-perf"))
|
else if (!strcmp(argv[i], "--wno-perf") || !strcmp(argv[i], "-wno-perf"))
|
||||||
g->emitPerfWarnings = false;
|
g->emitPerfWarnings = false;
|
||||||
else if (!strcmp(argv[i], "-o") || !strcmp(argv[i], "--outfile")) {
|
else if (!strcmp(argv[i], "-o")) {
|
||||||
if (++i == argc) usage(1);
|
if (++i == argc) usage(1);
|
||||||
outFileName = argv[i];
|
outFileName = argv[i];
|
||||||
}
|
}
|
||||||
else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--header-outfile")) {
|
else if (!strcmp(argv[i], "--outfile="))
|
||||||
|
outFileName = argv[i] + strlen("--outfile=");
|
||||||
|
else if (!strcmp(argv[i], "-h")) {
|
||||||
if (++i == argc) usage(1);
|
if (++i == argc) usage(1);
|
||||||
headerFileName = argv[i];
|
headerFileName = argv[i];
|
||||||
}
|
}
|
||||||
|
else if (!strcmp(argv[i], "--header-outfile=")) {
|
||||||
|
headerFileName = argv[i] + strlen("--header-outfile=");
|
||||||
|
}
|
||||||
else if (!strcmp(argv[i], "-O0")) {
|
else if (!strcmp(argv[i], "-O0")) {
|
||||||
g->opt.level = 0;
|
g->opt.level = 0;
|
||||||
optSet = true;
|
optSet = true;
|
||||||
@@ -306,6 +304,10 @@ int main(int Argc, char *Argv[]) {
|
|||||||
g->includeStdlib = false;
|
g->includeStdlib = false;
|
||||||
else if (!strcmp(argv[i], "--nocpp"))
|
else if (!strcmp(argv[i], "--nocpp"))
|
||||||
g->runCPP = false;
|
g->runCPP = false;
|
||||||
|
#ifndef ISPC_IS_WINDOWS
|
||||||
|
else if (!strcmp(argv[i], "--pic"))
|
||||||
|
generatePIC = true;
|
||||||
|
#endif // !ISPC_IS_WINDOWS
|
||||||
else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
|
else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
|
||||||
printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n",
|
printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n",
|
||||||
BUILD_DATE, BUILD_VERSION);
|
BUILD_DATE, BUILD_VERSION);
|
||||||
@@ -327,10 +329,8 @@ int main(int Argc, char *Argv[]) {
|
|||||||
if (debugSet && !optSet)
|
if (debugSet && !optSet)
|
||||||
g->opt.level = 0;
|
g->opt.level = 0;
|
||||||
|
|
||||||
// Make SSE2 the default target on atom unless the target has been set
|
if (!Target::GetTarget(arch, cpu, target, generatePIC, &g->target))
|
||||||
// explicitly.
|
usage(1);
|
||||||
if (!targetSet && (g->target.cpu == "atom"))
|
|
||||||
lDoTarget("sse2");
|
|
||||||
|
|
||||||
m = new Module(file);
|
m = new Module(file);
|
||||||
if (m->CompileFile() == 0) {
|
if (m->CompileFile() == 0) {
|
||||||
|
|||||||
188
module.cpp
188
module.cpp
@@ -72,23 +72,17 @@
|
|||||||
#include <llvm/Support/FormattedStream.h>
|
#include <llvm/Support/FormattedStream.h>
|
||||||
#include <llvm/Support/FileUtilities.h>
|
#include <llvm/Support/FileUtilities.h>
|
||||||
#include <llvm/Target/TargetMachine.h>
|
#include <llvm/Target/TargetMachine.h>
|
||||||
#include <llvm/Target/TargetRegistry.h>
|
|
||||||
#include <llvm/Target/TargetSelect.h>
|
|
||||||
#include <llvm/Target/TargetOptions.h>
|
#include <llvm/Target/TargetOptions.h>
|
||||||
#include <llvm/Target/TargetData.h>
|
#include <llvm/Target/TargetData.h>
|
||||||
#include <llvm/Target/SubtargetFeature.h>
|
|
||||||
#include <llvm/PassManager.h>
|
#include <llvm/PassManager.h>
|
||||||
#include <llvm/Analysis/Verifier.h>
|
#include <llvm/Analysis/Verifier.h>
|
||||||
#include <llvm/Support/CFG.h>
|
#include <llvm/Support/CFG.h>
|
||||||
#include <clang/Frontend/CompilerInstance.h>
|
#include <clang/Frontend/CompilerInstance.h>
|
||||||
|
#include <clang/Frontend/TextDiagnosticPrinter.h>
|
||||||
#include <clang/Frontend/Utils.h>
|
#include <clang/Frontend/Utils.h>
|
||||||
#include <clang/Basic/TargetInfo.h>
|
#include <clang/Basic/TargetInfo.h>
|
||||||
#ifndef LLVM_2_8
|
|
||||||
#include <llvm/Support/ToolOutputFile.h>
|
#include <llvm/Support/ToolOutputFile.h>
|
||||||
#include <llvm/Support/Host.h>
|
#include <llvm/Support/Host.h>
|
||||||
#else // !LLVM_2_8
|
|
||||||
#include <llvm/System/Host.h>
|
|
||||||
#endif // LLVM_2_8
|
|
||||||
#include <llvm/Assembly/PrintModulePass.h>
|
#include <llvm/Assembly/PrintModulePass.h>
|
||||||
#include <llvm/Support/raw_ostream.h>
|
#include <llvm/Support/raw_ostream.h>
|
||||||
#include <llvm/Bitcode/ReaderWriter.h>
|
#include <llvm/Bitcode/ReaderWriter.h>
|
||||||
@@ -107,51 +101,13 @@ Module::Module(const char *fn) {
|
|||||||
symbolTable = new SymbolTable;
|
symbolTable = new SymbolTable;
|
||||||
module = new llvm::Module(filename ? filename : "<stdin>", *g->ctx);
|
module = new llvm::Module(filename ? filename : "<stdin>", *g->ctx);
|
||||||
|
|
||||||
// initialize target in module
|
module->setTargetTriple(g->target.GetTripleString());
|
||||||
llvm::InitializeAllTargets();
|
|
||||||
|
|
||||||
llvm::Triple triple;
|
|
||||||
// Start with the host triple as the default
|
|
||||||
triple.setTriple(llvm::sys::getHostTriple());
|
|
||||||
if (g->target.arch != "") {
|
|
||||||
// If the user specified a target architecture, see if it's a known
|
|
||||||
// one; print an error with the valid ones otherwise.
|
|
||||||
const llvm::Target *target = NULL;
|
|
||||||
for (llvm::TargetRegistry::iterator iter = llvm::TargetRegistry::begin();
|
|
||||||
iter != llvm::TargetRegistry::end(); ++iter) {
|
|
||||||
if (g->target.arch == iter->getName()) {
|
|
||||||
target = &*iter;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!target) {
|
|
||||||
fprintf(stderr, "Invalid target \"%s\"\nOptions: ",
|
|
||||||
g->target.arch.c_str());
|
|
||||||
llvm::TargetRegistry::iterator iter;
|
|
||||||
for (iter = llvm::TargetRegistry::begin();
|
|
||||||
iter != llvm::TargetRegistry::end(); ++iter)
|
|
||||||
fprintf(stderr, "%s ", iter->getName());
|
|
||||||
fprintf(stderr, "\n");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// And override the arch in the host triple
|
|
||||||
llvm::Triple::ArchType archType =
|
|
||||||
llvm::Triple::getArchTypeForLLVMName(g->target.arch);
|
|
||||||
if (archType != llvm::Triple::UnknownArch)
|
|
||||||
triple.setArch(archType);
|
|
||||||
}
|
|
||||||
module->setTargetTriple(triple.str());
|
|
||||||
|
|
||||||
|
|
||||||
#ifndef LLVM_2_8
|
|
||||||
if (g->generateDebuggingSymbols)
|
if (g->generateDebuggingSymbols)
|
||||||
diBuilder = new llvm::DIBuilder(*module);
|
diBuilder = new llvm::DIBuilder(*module);
|
||||||
else
|
else
|
||||||
diBuilder = NULL;
|
diBuilder = NULL;
|
||||||
#endif // LLVM_2_8
|
|
||||||
|
|
||||||
#ifndef LLVM_2_8
|
|
||||||
// If we're generating debugging symbols, let the DIBuilder know that
|
// If we're generating debugging symbols, let the DIBuilder know that
|
||||||
// we're starting a new compilation unit.
|
// we're starting a new compilation unit.
|
||||||
if (diBuilder != NULL) {
|
if (diBuilder != NULL) {
|
||||||
@@ -177,7 +133,6 @@ Module::Module(const char *fn) {
|
|||||||
0 /* run time version */);
|
0 /* run time version */);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif // LLVM_2_8
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -191,6 +146,9 @@ extern void yy_delete_buffer(YY_BUFFER_STATE);
|
|||||||
|
|
||||||
int
|
int
|
||||||
Module::CompileFile() {
|
Module::CompileFile() {
|
||||||
|
if (g->opt.fastMath == true)
|
||||||
|
llvm::UnsafeFPMath = true;
|
||||||
|
|
||||||
// FIXME: it'd be nice to do this in the Module constructor, but this
|
// FIXME: it'd be nice to do this in the Module constructor, but this
|
||||||
// function ends up calling into routines that expect the global
|
// function ends up calling into routines that expect the global
|
||||||
// variable 'm' to be initialized and available (which it isn't until
|
// variable 'm' to be initialized and available (which it isn't until
|
||||||
@@ -495,6 +453,10 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
|
|||||||
// declarations, typedefs, and global variables declarations /
|
// declarations, typedefs, and global variables declarations /
|
||||||
// definitions. Figure out what we've got and take care of it.
|
// definitions. Figure out what we've got and take care of it.
|
||||||
|
|
||||||
|
if (ds == NULL || decl == NULL)
|
||||||
|
// Error happened earlier during parsing
|
||||||
|
return;
|
||||||
|
|
||||||
if (decl->isFunction) {
|
if (decl->isFunction) {
|
||||||
// function declaration
|
// function declaration
|
||||||
const Type *t = decl->GetType(ds);
|
const Type *t = decl->GetType(ds);
|
||||||
@@ -595,7 +557,6 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
|
|||||||
decl->sym->name.c_str());
|
decl->sym->name.c_str());
|
||||||
m->symbolTable->AddVariable(decl->sym);
|
m->symbolTable->AddVariable(decl->sym);
|
||||||
|
|
||||||
#ifndef LLVM_2_8
|
|
||||||
if (diBuilder && (ds->storageClass != SC_EXTERN)) {
|
if (diBuilder && (ds->storageClass != SC_EXTERN)) {
|
||||||
llvm::DIFile file = decl->pos.GetDIFile();
|
llvm::DIFile file = decl->pos.GetDIFile();
|
||||||
diBuilder->createGlobalVariable(decl->sym->name,
|
diBuilder->createGlobalVariable(decl->sym->name,
|
||||||
@@ -605,7 +566,6 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
|
|||||||
(ds->storageClass == SC_STATIC),
|
(ds->storageClass == SC_STATIC),
|
||||||
decl->sym->storagePtr);
|
decl->sym->storagePtr);
|
||||||
}
|
}
|
||||||
#endif // LLVM_2_8
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -667,6 +627,8 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
|
|||||||
llvm::Value *structParamPtr = argIter++;
|
llvm::Value *structParamPtr = argIter++;
|
||||||
llvm::Value *threadIndex = argIter++;
|
llvm::Value *threadIndex = argIter++;
|
||||||
llvm::Value *threadCount = argIter++;
|
llvm::Value *threadCount = argIter++;
|
||||||
|
llvm::Value *taskIndex = argIter++;
|
||||||
|
llvm::Value *taskCount = argIter++;
|
||||||
|
|
||||||
// Copy the function parameter values from the structure into local
|
// Copy the function parameter values from the structure into local
|
||||||
// storage
|
// storage
|
||||||
@@ -694,13 +656,17 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
|
|||||||
threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount");
|
threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount");
|
||||||
ctx->StoreInst(threadCount, threadCountSym->storagePtr);
|
ctx->StoreInst(threadCount, threadCountSym->storagePtr);
|
||||||
|
|
||||||
#ifdef ISPC_IS_WINDOWS
|
// Copy taskIndex and taskCount into stack-allocated storage so
|
||||||
// On Windows, we dynamically-allocate space for the task arguments
|
// that their symbols point to something reasonable.
|
||||||
// (see FunctionEmitContext::LaunchInst().) Here is where we emit
|
Symbol *taskIndexSym = m->symbolTable->LookupVariable("taskIndex");
|
||||||
// the code to free that memory, now that we've copied the
|
assert(taskIndexSym);
|
||||||
// parameter values out of the structure.
|
taskIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex");
|
||||||
ctx->EmitFree(structParamPtr);
|
ctx->StoreInst(taskIndex, taskIndexSym->storagePtr);
|
||||||
#endif // ISPC_IS_WINDOWS
|
|
||||||
|
Symbol *taskCountSym = m->symbolTable->LookupVariable("taskCount");
|
||||||
|
assert(taskCountSym);
|
||||||
|
taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount");
|
||||||
|
ctx->StoreInst(taskCount, taskCountSym->storagePtr);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// Regular, non-task function
|
// Regular, non-task function
|
||||||
@@ -738,8 +704,18 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
|
|||||||
|
|
||||||
// Finally, we can generate code for the function
|
// Finally, we can generate code for the function
|
||||||
if (code != NULL) {
|
if (code != NULL) {
|
||||||
|
int costEstimate = code->EstimateCost();
|
||||||
bool checkMask = (ft->isTask == true) ||
|
bool checkMask = (ft->isTask == true) ||
|
||||||
(function->hasFnAttr(llvm::Attribute::AlwaysInline) == false);
|
((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
|
||||||
|
costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
|
||||||
|
Debug(code->pos, "Estimated cost for function \"%s\" = %d\n",
|
||||||
|
funSym->name.c_str(), costEstimate);
|
||||||
|
// If the body of the function is non-trivial, then we wrap the
|
||||||
|
// entire thing around a varying "cif (true)" test in order to reap
|
||||||
|
// the side-effect benefit of checking to see if the execution mask
|
||||||
|
// is all on and thence having a specialized code path for that
|
||||||
|
// case. If this is a simple function, then this isn't worth the
|
||||||
|
// code bloat / overhead.
|
||||||
if (checkMask) {
|
if (checkMask) {
|
||||||
bool allTrue[ISPC_MAX_NVEC];
|
bool allTrue[ISPC_MAX_NVEC];
|
||||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||||
@@ -887,6 +863,11 @@ Module::AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code) {
|
|||||||
|
|
||||||
bool
|
bool
|
||||||
Module::WriteOutput(OutputType outputType, const char *outFileName) {
|
Module::WriteOutput(OutputType outputType, const char *outFileName) {
|
||||||
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
|
if (diBuilder != NULL && outputType != Header)
|
||||||
|
diBuilder->finalize();
|
||||||
|
#endif // LLVM_3_0
|
||||||
|
|
||||||
// First, issue a warning if the output file suffix and the type of
|
// First, issue a warning if the output file suffix and the type of
|
||||||
// file being created seem to mismatch. This can help catch missing
|
// file being created seem to mismatch. This can help catch missing
|
||||||
// command-line arguments specifying the output file type.
|
// command-line arguments specifying the output file type.
|
||||||
@@ -947,12 +928,7 @@ Module::WriteOutput(OutputType outputType, const char *outFileName) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
#ifdef LLVM_2_8
|
|
||||||
fprintf(stderr, "Direct object file emission not supported in this build.\n");
|
|
||||||
return false;
|
|
||||||
#else
|
|
||||||
return writeObjectFileOrAssembly(outputType, outFileName);
|
return writeObjectFileOrAssembly(outputType, outFileName);
|
||||||
#endif // LLVM_2_8
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -960,43 +936,7 @@ Module::WriteOutput(OutputType outputType, const char *outFileName) {
|
|||||||
|
|
||||||
bool
|
bool
|
||||||
Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName) {
|
Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName) {
|
||||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
llvm::TargetMachine *targetMachine = g->target.GetTargetMachine();
|
||||||
llvm::InitializeAllTargetMCs();
|
|
||||||
#endif
|
|
||||||
llvm::InitializeAllAsmPrinters();
|
|
||||||
llvm::InitializeAllAsmParsers();
|
|
||||||
|
|
||||||
llvm::Triple triple(module->getTargetTriple());
|
|
||||||
assert(triple.getTriple().empty() == false);
|
|
||||||
|
|
||||||
const llvm::Target *target = NULL;
|
|
||||||
std::string error;
|
|
||||||
target = llvm::TargetRegistry::lookupTarget(triple.getTriple(), error);
|
|
||||||
assert(target != NULL);
|
|
||||||
|
|
||||||
std::string featuresString;
|
|
||||||
llvm::TargetMachine *targetMachine = NULL;
|
|
||||||
#if defined LLVM_3_0svn || defined LLVM_3_0
|
|
||||||
if (g->target.isa == Target::AVX)
|
|
||||||
featuresString = "+avx";
|
|
||||||
targetMachine = target->createTargetMachine(triple.getTriple(), g->target.cpu,
|
|
||||||
featuresString);
|
|
||||||
#else
|
|
||||||
if (g->target.cpu.size()) {
|
|
||||||
llvm::SubtargetFeatures features;
|
|
||||||
features.setCPU(g->target.cpu);
|
|
||||||
featuresString = features.getString();
|
|
||||||
}
|
|
||||||
|
|
||||||
targetMachine = target->createTargetMachine(triple.getTriple(),
|
|
||||||
featuresString);
|
|
||||||
#endif
|
|
||||||
if (targetMachine == NULL) {
|
|
||||||
fprintf(stderr, "Unable to create target machine for target \"%s\"!",
|
|
||||||
triple.str().c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
targetMachine->setAsmVerbosityDefault(true);
|
|
||||||
|
|
||||||
// Figure out if we're generating object file or assembly output, and
|
// Figure out if we're generating object file or assembly output, and
|
||||||
// set binary output for object files
|
// set binary output for object files
|
||||||
@@ -1005,6 +945,7 @@ Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName
|
|||||||
bool binary = (fileType == llvm::TargetMachine::CGFT_ObjectFile);
|
bool binary = (fileType == llvm::TargetMachine::CGFT_ObjectFile);
|
||||||
unsigned int flags = binary ? llvm::raw_fd_ostream::F_Binary : 0;
|
unsigned int flags = binary ? llvm::raw_fd_ostream::F_Binary : 0;
|
||||||
|
|
||||||
|
std::string error;
|
||||||
llvm::tool_output_file *of = new llvm::tool_output_file(outFileName, error, flags);
|
llvm::tool_output_file *of = new llvm::tool_output_file(outFileName, error, flags);
|
||||||
if (error.size()) {
|
if (error.size()) {
|
||||||
fprintf(stderr, "Error opening output file \"%s\".\n", outFileName);
|
fprintf(stderr, "Error opening output file \"%s\".\n", outFileName);
|
||||||
@@ -1022,9 +963,8 @@ Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName
|
|||||||
(g->opt.level > 0) ? llvm::CodeGenOpt::Aggressive : llvm::CodeGenOpt::None;
|
(g->opt.level > 0) ? llvm::CodeGenOpt::Aggressive : llvm::CodeGenOpt::None;
|
||||||
|
|
||||||
if (targetMachine->addPassesToEmitFile(pm, fos, fileType, optLevel)) {
|
if (targetMachine->addPassesToEmitFile(pm, fos, fileType, optLevel)) {
|
||||||
fprintf(stderr, "Fatal error adding passes to emit object file for "
|
fprintf(stderr, "Fatal error adding passes to emit object file!");
|
||||||
"target %s!\n", triple.str().c_str());
|
exit(1);
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finally, run the passes to emit the object file/assembly
|
// Finally, run the passes to emit the object file/assembly
|
||||||
@@ -1190,6 +1130,12 @@ lEmitVectorTypedefs(const std::vector<const VectorType *> &types, FILE *file) {
|
|||||||
for (unsigned int i = 0; i < types.size(); ++i) {
|
for (unsigned int i = 0; i < types.size(); ++i) {
|
||||||
std::string baseDecl;
|
std::string baseDecl;
|
||||||
const VectorType *vt = types[i]->GetAsNonConstType();
|
const VectorType *vt = types[i]->GetAsNonConstType();
|
||||||
|
if (!vt->IsUniformType())
|
||||||
|
// Varying stuff shouldn't be visibile to / used by the
|
||||||
|
// application, so at least make it not simple to access it by
|
||||||
|
// not declaring the type here...
|
||||||
|
continue;
|
||||||
|
|
||||||
int size = vt->GetElementCount();
|
int size = vt->GetElementCount();
|
||||||
|
|
||||||
baseDecl = vt->GetBaseType()->GetCDeclaration("");
|
baseDecl = vt->GetBaseType()->GetCDeclaration("");
|
||||||
@@ -1362,6 +1308,7 @@ Module::writeHeader(const char *fn) {
|
|||||||
default:
|
default:
|
||||||
FATAL("Unhandled target in header emission");
|
FATAL("Unhandled target in header emission");
|
||||||
}
|
}
|
||||||
|
fprintf(f, "#define ISPC_TARGET_VECTOR_WIDTH %d\n", g->target.vectorWidth);
|
||||||
|
|
||||||
fprintf(f, "#ifdef __cplusplus\nnamespace ispc {\n#endif // __cplusplus\n\n");
|
fprintf(f, "#ifdef __cplusplus\nnamespace ispc {\n#endif // __cplusplus\n\n");
|
||||||
|
|
||||||
@@ -1398,14 +1345,6 @@ Module::writeHeader(const char *fn) {
|
|||||||
lEmitEnumDecls(exportedEnumTypes, f);
|
lEmitEnumDecls(exportedEnumTypes, f);
|
||||||
lEmitStructDecls(exportedStructTypes, f);
|
lEmitStructDecls(exportedStructTypes, f);
|
||||||
|
|
||||||
// emit externs for globals
|
|
||||||
if (externGlobals.size() > 0) {
|
|
||||||
fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
|
|
||||||
fprintf(f, "// Globals declared \"extern\" from ispc code\n");
|
|
||||||
fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
|
|
||||||
lPrintExternGlobals(f, externGlobals);
|
|
||||||
}
|
|
||||||
|
|
||||||
// emit function declarations for exported stuff...
|
// emit function declarations for exported stuff...
|
||||||
if (exportedFuncs.size() > 0) {
|
if (exportedFuncs.size() > 0) {
|
||||||
fprintf(f, "\n");
|
fprintf(f, "\n");
|
||||||
@@ -1427,6 +1366,15 @@ Module::writeHeader(const char *fn) {
|
|||||||
// end namespace
|
// end namespace
|
||||||
fprintf(f, "\n#ifdef __cplusplus\n}\n#endif // __cplusplus\n");
|
fprintf(f, "\n#ifdef __cplusplus\n}\n#endif // __cplusplus\n");
|
||||||
|
|
||||||
|
// and only now emit externs for globals, outside of the ispc namespace
|
||||||
|
if (externGlobals.size() > 0) {
|
||||||
|
fprintf(f, "\n");
|
||||||
|
fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
|
||||||
|
fprintf(f, "// Globals declared \"extern\" from ispc code\n");
|
||||||
|
fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
|
||||||
|
lPrintExternGlobals(f, externGlobals);
|
||||||
|
}
|
||||||
|
|
||||||
// end guard
|
// end guard
|
||||||
fprintf(f, "\n#endif // %s\n", guard.c_str());
|
fprintf(f, "\n#endif // %s\n", guard.c_str());
|
||||||
|
|
||||||
@@ -1442,23 +1390,26 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
|
|||||||
std::string error;
|
std::string error;
|
||||||
|
|
||||||
inst.createFileManager();
|
inst.createFileManager();
|
||||||
inst.createDiagnostics(0, NULL);
|
|
||||||
clang::TargetOptions& options = inst.getTargetOpts();
|
|
||||||
|
|
||||||
|
llvm::raw_fd_ostream stderrRaw(2, false);
|
||||||
|
clang::TextDiagnosticPrinter *diagPrinter =
|
||||||
|
new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions());
|
||||||
|
inst.createDiagnostics(0, NULL, diagPrinter);
|
||||||
|
|
||||||
|
clang::TargetOptions &options = inst.getTargetOpts();
|
||||||
llvm::Triple triple(module->getTargetTriple());
|
llvm::Triple triple(module->getTargetTriple());
|
||||||
if (triple.getTriple().empty())
|
if (triple.getTriple().empty())
|
||||||
triple.setTriple(llvm::sys::getHostTriple());
|
triple.setTriple(llvm::sys::getHostTriple());
|
||||||
|
|
||||||
options.Triple = triple.getTriple();
|
options.Triple = triple.getTriple();
|
||||||
|
|
||||||
clang::TargetInfo* target
|
clang::TargetInfo *target =
|
||||||
= clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options);
|
clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options);
|
||||||
|
|
||||||
inst.setTarget(target);
|
inst.setTarget(target);
|
||||||
inst.createSourceManager(inst.getFileManager());
|
inst.createSourceManager(inst.getFileManager());
|
||||||
inst.InitializeSourceManager(infilename);
|
inst.InitializeSourceManager(infilename);
|
||||||
|
|
||||||
clang::PreprocessorOptions& opts = inst.getPreprocessorOpts();
|
clang::PreprocessorOptions &opts = inst.getPreprocessorOpts();
|
||||||
|
|
||||||
//Add defs for ISPC and PI
|
//Add defs for ISPC and PI
|
||||||
opts.addMacroDef("ISPC");
|
opts.addMacroDef("ISPC");
|
||||||
@@ -1471,7 +1422,10 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
inst.createPreprocessor();
|
inst.createPreprocessor();
|
||||||
|
|
||||||
|
clang::LangOptions langOptions;
|
||||||
|
diagPrinter->BeginSourceFile(langOptions, &inst.getPreprocessor());
|
||||||
clang::DoPrintPreprocessedInput(inst.getPreprocessor(),
|
clang::DoPrintPreprocessedInput(inst.getPreprocessor(),
|
||||||
ostream, inst.getPreprocessorOutputOpts());
|
ostream, inst.getPreprocessorOutputOpts());
|
||||||
|
diagPrinter->EndSourceFile();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
5
module.h
5
module.h
@@ -91,11 +91,8 @@ public:
|
|||||||
/** llvm Module object into which globals and functions are added. */
|
/** llvm Module object into which globals and functions are added. */
|
||||||
llvm::Module *module;
|
llvm::Module *module;
|
||||||
|
|
||||||
#ifndef LLVM_2_8
|
/** The diBuilder manages generating debugging information */
|
||||||
/** The diBuilder manages generating debugging information (only
|
|
||||||
supported in LLVM 2.9 and beyond...) */
|
|
||||||
llvm::DIBuilder *diBuilder;
|
llvm::DIBuilder *diBuilder;
|
||||||
#endif
|
|
||||||
|
|
||||||
GatherBuffer *gatherBuffer;
|
GatherBuffer *gatherBuffer;
|
||||||
|
|
||||||
|
|||||||
233
opt.cpp
233
opt.cpp
@@ -55,13 +55,12 @@
|
|||||||
#include <llvm/Instructions.h>
|
#include <llvm/Instructions.h>
|
||||||
#include <llvm/Intrinsics.h>
|
#include <llvm/Intrinsics.h>
|
||||||
#include <llvm/Constants.h>
|
#include <llvm/Constants.h>
|
||||||
#ifndef LLVM_2_8
|
#include <llvm/Analysis/ConstantFolding.h>
|
||||||
#include <llvm/Target/TargetLibraryInfo.h>
|
#include <llvm/Target/TargetLibraryInfo.h>
|
||||||
#ifdef LLVM_2_9
|
#ifdef LLVM_2_9
|
||||||
#include <llvm/Support/StandardPasses.h>
|
#include <llvm/Support/StandardPasses.h>
|
||||||
#else
|
#else
|
||||||
#include <llvm/Support/PassManagerBuilder.h>
|
#include <llvm/Transforms/IPO/PassManagerBuilder.h>
|
||||||
#endif // LLVM_2_9
|
|
||||||
#endif // LLVM_2_8
|
#endif // LLVM_2_8
|
||||||
#include <llvm/ADT/Triple.h>
|
#include <llvm/ADT/Triple.h>
|
||||||
#include <llvm/Transforms/Scalar.h>
|
#include <llvm/Transforms/Scalar.h>
|
||||||
@@ -69,13 +68,18 @@
|
|||||||
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
|
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
|
||||||
#include <llvm/Target/TargetOptions.h>
|
#include <llvm/Target/TargetOptions.h>
|
||||||
#include <llvm/Target/TargetData.h>
|
#include <llvm/Target/TargetData.h>
|
||||||
|
#include <llvm/Target/TargetMachine.h>
|
||||||
#include <llvm/Analysis/Verifier.h>
|
#include <llvm/Analysis/Verifier.h>
|
||||||
#include <llvm/Support/raw_ostream.h>
|
#include <llvm/Support/raw_ostream.h>
|
||||||
#ifndef LLVM_2_8
|
|
||||||
#include <llvm/Analysis/DIBuilder.h>
|
#include <llvm/Analysis/DIBuilder.h>
|
||||||
#endif
|
|
||||||
#include <llvm/Analysis/DebugInfo.h>
|
#include <llvm/Analysis/DebugInfo.h>
|
||||||
#include <llvm/Support/Dwarf.h>
|
#include <llvm/Support/Dwarf.h>
|
||||||
|
#ifdef ISPC_IS_LINUX
|
||||||
|
#include <alloca.h>
|
||||||
|
#elif defined(ISPC_IS_WINDOWS)
|
||||||
|
#include <malloc.h>
|
||||||
|
#define alloca _alloca
|
||||||
|
#endif // ISPC_IS_WINDOWS
|
||||||
|
|
||||||
static llvm::Pass *CreateIntrinsicsOptPass();
|
static llvm::Pass *CreateIntrinsicsOptPass();
|
||||||
static llvm::Pass *CreateGatherScatterFlattenPass();
|
static llvm::Pass *CreateGatherScatterFlattenPass();
|
||||||
@@ -178,19 +182,22 @@ Optimize(llvm::Module *module, int optLevel) {
|
|||||||
llvm::PassManager optPM;
|
llvm::PassManager optPM;
|
||||||
llvm::FunctionPassManager funcPM(module);
|
llvm::FunctionPassManager funcPM(module);
|
||||||
|
|
||||||
#ifndef LLVM_2_8
|
|
||||||
llvm::TargetLibraryInfo *targetLibraryInfo =
|
llvm::TargetLibraryInfo *targetLibraryInfo =
|
||||||
new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
|
new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
|
||||||
optPM.add(targetLibraryInfo);
|
optPM.add(targetLibraryInfo);
|
||||||
#endif
|
|
||||||
optPM.add(new llvm::TargetData(module));
|
optPM.add(new llvm::TargetData(module));
|
||||||
|
|
||||||
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
|
optPM.add(llvm::createIndVarSimplifyPass());
|
||||||
|
#endif
|
||||||
|
|
||||||
if (optLevel == 0) {
|
if (optLevel == 0) {
|
||||||
// This is more or less the minimum set of optimizations that we
|
// This is more or less the minimum set of optimizations that we
|
||||||
// need to do to generate code that will actually run. (We can't
|
// need to do to generate code that will actually run. (We can't
|
||||||
// run absolutely no optimizations, since the front-end needs us to
|
// run absolutely no optimizations, since the front-end needs us to
|
||||||
// take the various __pseudo_* functions it has emitted and turn
|
// take the various __pseudo_* functions it has emitted and turn
|
||||||
// them into something that can actually execute.
|
// them into something that can actually execute.
|
||||||
|
optPM.add(llvm::createPromoteMemoryToRegisterPass());
|
||||||
optPM.add(CreateGatherScatterFlattenPass());
|
optPM.add(CreateGatherScatterFlattenPass());
|
||||||
optPM.add(CreateLowerGatherScatterPass());
|
optPM.add(CreateLowerGatherScatterPass());
|
||||||
optPM.add(CreateLowerMaskedStorePass());
|
optPM.add(CreateLowerMaskedStorePass());
|
||||||
@@ -211,7 +218,6 @@ Optimize(llvm::Module *module, int optLevel) {
|
|||||||
// only later in the optimization process as things like constant
|
// only later in the optimization process as things like constant
|
||||||
// propagation have done their thing, and then when they do kick
|
// propagation have done their thing, and then when they do kick
|
||||||
// in, they can often open up new opportunities for optimization...
|
// in, they can often open up new opportunities for optimization...
|
||||||
#ifndef LLVM_2_8
|
|
||||||
llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
|
llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
|
||||||
llvm::initializeCore(*registry);
|
llvm::initializeCore(*registry);
|
||||||
llvm::initializeScalarOpts(*registry);
|
llvm::initializeScalarOpts(*registry);
|
||||||
@@ -222,7 +228,7 @@ Optimize(llvm::Module *module, int optLevel) {
|
|||||||
llvm::initializeInstCombine(*registry);
|
llvm::initializeInstCombine(*registry);
|
||||||
llvm::initializeInstrumentation(*registry);
|
llvm::initializeInstrumentation(*registry);
|
||||||
llvm::initializeTarget(*registry);
|
llvm::initializeTarget(*registry);
|
||||||
#endif
|
|
||||||
// Early optimizations to try to reduce the total amount of code to
|
// Early optimizations to try to reduce the total amount of code to
|
||||||
// work with if we can
|
// work with if we can
|
||||||
optPM.add(CreateGatherScatterFlattenPass());
|
optPM.add(CreateGatherScatterFlattenPass());
|
||||||
@@ -279,13 +285,11 @@ Optimize(llvm::Module *module, int optLevel) {
|
|||||||
optPM.add(llvm::createConstantPropagationPass());
|
optPM.add(llvm::createConstantPropagationPass());
|
||||||
optPM.add(CreateIntrinsicsOptPass());
|
optPM.add(CreateIntrinsicsOptPass());
|
||||||
|
|
||||||
#if defined(LLVM_2_8)
|
#if defined(LLVM_2_9)
|
||||||
optPM.add(CreateIsCompileTimeConstantPass(true));
|
|
||||||
#elif defined(LLVM_2_9)
|
|
||||||
llvm::createStandardModulePasses(&optPM, 3,
|
llvm::createStandardModulePasses(&optPM, 3,
|
||||||
false /* opt size */,
|
false /* opt size */,
|
||||||
true /* unit at a time */,
|
true /* unit at a time */,
|
||||||
false /* unroll loops */,
|
g->opt.unrollLoops,
|
||||||
true /* simplify lib calls */,
|
true /* simplify lib calls */,
|
||||||
false /* may have exceptions */,
|
false /* may have exceptions */,
|
||||||
llvm::createFunctionInliningPass());
|
llvm::createFunctionInliningPass());
|
||||||
@@ -300,7 +304,7 @@ Optimize(llvm::Module *module, int optLevel) {
|
|||||||
llvm::createStandardModulePasses(&optPM, 3,
|
llvm::createStandardModulePasses(&optPM, 3,
|
||||||
false /* opt size */,
|
false /* opt size */,
|
||||||
true /* unit at a time */,
|
true /* unit at a time */,
|
||||||
false /* unroll loops */,
|
g->opt.unrollLoops,
|
||||||
true /* simplify lib calls */,
|
true /* simplify lib calls */,
|
||||||
false /* may have exceptions */,
|
false /* may have exceptions */,
|
||||||
llvm::createFunctionInliningPass());
|
llvm::createFunctionInliningPass());
|
||||||
@@ -309,6 +313,8 @@ Optimize(llvm::Module *module, int optLevel) {
|
|||||||
llvm::PassManagerBuilder builder;
|
llvm::PassManagerBuilder builder;
|
||||||
builder.OptLevel = 3;
|
builder.OptLevel = 3;
|
||||||
builder.Inliner = llvm::createFunctionInliningPass();
|
builder.Inliner = llvm::createFunctionInliningPass();
|
||||||
|
if (g->opt.unrollLoops == false)
|
||||||
|
builder.DisableUnrollLoops = true;
|
||||||
builder.populateFunctionPassManager(funcPM);
|
builder.populateFunctionPassManager(funcPM);
|
||||||
builder.populateModulePassManager(optPM);
|
builder.populateModulePassManager(optPM);
|
||||||
optPM.add(CreateIsCompileTimeConstantPass(true));
|
optPM.add(CreateIsCompileTimeConstantPass(true));
|
||||||
@@ -421,8 +427,11 @@ IntrinsicsOpt::IntrinsicsOpt()
|
|||||||
blendInstructions.push_back(BlendInstruction(
|
blendInstructions.push_back(BlendInstruction(
|
||||||
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse41_blendvps),
|
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse41_blendvps),
|
||||||
0xf, 0, 1, 2));
|
0xf, 0, 1, 2));
|
||||||
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
blendInstructions.push_back(BlendInstruction(
|
blendInstructions.push_back(BlendInstruction(
|
||||||
m->module->getFunction("llvm.x86.avx.blendvps"), 0xff, 0, 1, 2));
|
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_blendv_ps_256),
|
||||||
|
0xff, 0, 1, 2));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -469,8 +478,18 @@ lGetMask(llvm::Value *factor) {
|
|||||||
else if (llvm::isa<llvm::ConstantAggregateZero>(factor))
|
else if (llvm::isa<llvm::ConstantAggregateZero>(factor))
|
||||||
return 0;
|
return 0;
|
||||||
else {
|
else {
|
||||||
|
#if 0
|
||||||
|
llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
|
||||||
|
if (ce != NULL) {
|
||||||
|
llvm::TargetMachine *targetMachine = g->target.GetTargetMachine();
|
||||||
|
const llvm::TargetData *td = targetMachine->getTargetData();
|
||||||
|
llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
|
||||||
|
c->dump();
|
||||||
|
factor = c;
|
||||||
|
}
|
||||||
// else we should be able to handle it above...
|
// else we should be able to handle it above...
|
||||||
assert(!llvm::isa<llvm::Constant>(factor));
|
assert(!llvm::isa<llvm::Constant>(factor));
|
||||||
|
#endif
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -608,9 +627,10 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|||||||
llvm::PointerType::get(returnType, 0),
|
llvm::PointerType::get(returnType, 0),
|
||||||
"ptr2vec", callInst);
|
"ptr2vec", callInst);
|
||||||
lCopyMetadata(castPtr, callInst);
|
lCopyMetadata(castPtr, callInst);
|
||||||
|
int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
|
||||||
llvm::Instruction *loadInst =
|
llvm::Instruction *loadInst =
|
||||||
new llvm::LoadInst(castPtr, "load", false /* not volatile */,
|
new llvm::LoadInst(castPtr, "load", false /* not volatile */,
|
||||||
0 /* align */, (llvm::Instruction *)NULL);
|
align, (llvm::Instruction *)NULL);
|
||||||
lCopyMetadata(loadInst, callInst);
|
lCopyMetadata(loadInst, callInst);
|
||||||
llvm::ReplaceInstWithInst(callInst, loadInst);
|
llvm::ReplaceInstWithInst(callInst, loadInst);
|
||||||
modifiedAny = true;
|
modifiedAny = true;
|
||||||
@@ -630,17 +650,21 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|||||||
}
|
}
|
||||||
else if (mask == 0xff) {
|
else if (mask == 0xff) {
|
||||||
// all lanes storing, so replace with a regular store
|
// all lanes storing, so replace with a regular store
|
||||||
llvm::Value *rvalue = callInst->getArgOperand(1);
|
llvm::Value *rvalue = callInst->getArgOperand(2);
|
||||||
llvm::Type *storeType = rvalue->getType();
|
llvm::Type *storeType = rvalue->getType();
|
||||||
llvm::Value *castPtr =
|
llvm::Value *castPtr =
|
||||||
new llvm::BitCastInst(callInst->getArgOperand(0),
|
new llvm::BitCastInst(callInst->getArgOperand(0),
|
||||||
llvm::PointerType::get(storeType, 0),
|
llvm::PointerType::get(storeType, 0),
|
||||||
"ptr2vec", callInst);
|
"ptr2vec", callInst);
|
||||||
lCopyMetadata(castPtr, callInst);
|
lCopyMetadata(castPtr, callInst);
|
||||||
llvm::Instruction *storeInst =
|
|
||||||
|
llvm::StoreInst *storeInst =
|
||||||
new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
|
new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
|
||||||
|
int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
|
||||||
|
storeInst->setAlignment(align);
|
||||||
lCopyMetadata(storeInst, callInst);
|
lCopyMetadata(storeInst, callInst);
|
||||||
llvm::ReplaceInstWithInst(callInst, storeInst);
|
llvm::ReplaceInstWithInst(callInst, storeInst);
|
||||||
|
|
||||||
modifiedAny = true;
|
modifiedAny = true;
|
||||||
goto restart;
|
goto restart;
|
||||||
}
|
}
|
||||||
@@ -1416,15 +1440,12 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|||||||
llvm::Value *rvalue = callInst->getArgOperand(1);
|
llvm::Value *rvalue = callInst->getArgOperand(1);
|
||||||
llvm::Value *mask = callInst->getArgOperand(2);
|
llvm::Value *mask = callInst->getArgOperand(2);
|
||||||
|
|
||||||
// On SSE, we need to choose between doing the load + blend + store
|
// We need to choose between doing the load + blend + store trick,
|
||||||
// trick, or serializing the masked store. On targets with a
|
// or serializing the masked store. Even on targets with a native
|
||||||
// native masked store instruction, the implementations of
|
// masked store instruction, this is preferable since it lets us
|
||||||
// __masked_store_blend_* should be the same as __masked_store_*,
|
// keep values in registers rather than going out to the stack.
|
||||||
// so this doesn't matter. On SSE, blending is generally more
|
bool doBlend = (!g->opt.disableBlendedMaskedStores ||
|
||||||
// efficient and is always safe to do on stack-allocated values.(?)
|
lIsStackVariablePointer(lvalue));
|
||||||
bool doBlend = lIsStackVariablePointer(lvalue);
|
|
||||||
if (g->target.isa == Target::SSE4 || g->target.isa == Target::SSE2)
|
|
||||||
doBlend |= !g->opt.disableBlendedMaskedStores;
|
|
||||||
|
|
||||||
// Generate the call to the appropriate masked store function and
|
// Generate the call to the appropriate masked store function and
|
||||||
// replace the __pseudo_* one with it.
|
// replace the __pseudo_* one with it.
|
||||||
@@ -1502,8 +1523,8 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC])
|
|||||||
|
|
||||||
|
|
||||||
/** Given an LLVM vector in vec, return a 'scalarized' version of the
|
/** Given an LLVM vector in vec, return a 'scalarized' version of the
|
||||||
vector in the provided offsets[] array. For example, if the vector
|
vector in the provided scalarizedVector[] array. For example, if the
|
||||||
value passed in is:
|
vector value passed in is:
|
||||||
|
|
||||||
add <4 x i32> %a_smear, <4 x i32> <4, 8, 12, 16>,
|
add <4 x i32> %a_smear, <4 x i32> <4, 8, 12, 16>,
|
||||||
|
|
||||||
@@ -1524,28 +1545,39 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC])
|
|||||||
@param vec Vector to be scalarized
|
@param vec Vector to be scalarized
|
||||||
@param scalarizedVector Array in which to store the individual vector
|
@param scalarizedVector Array in which to store the individual vector
|
||||||
elements
|
elements
|
||||||
|
@param vectorLength Number of elements in the given vector. (The
|
||||||
|
passed scalarizedVector array must also be at least
|
||||||
|
this length as well.)
|
||||||
@returns True if the vector was successfully scalarized and
|
@returns True if the vector was successfully scalarized and
|
||||||
the values in offsets[] are valid; false otherwise
|
the values in offsets[] are valid; false otherwise
|
||||||
*/
|
*/
|
||||||
static bool
|
static bool
|
||||||
lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC]) {
|
lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector,
|
||||||
|
int vectorLength) {
|
||||||
// First initialize the values of scalarizedVector[] to NULL.
|
// First initialize the values of scalarizedVector[] to NULL.
|
||||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
for (int i = 0; i < vectorLength; ++i)
|
||||||
scalarizedVector[i] = NULL;
|
scalarizedVector[i] = NULL;
|
||||||
|
|
||||||
|
// It may be ok for the vector to be an undef vector; these come up for
|
||||||
|
// example in shufflevector instructions. As long as elements of the
|
||||||
|
// undef vector aren't referenced by the shuffle indices, this is fine.
|
||||||
|
if (llvm::isa<llvm::UndefValue>(vec))
|
||||||
|
return true;
|
||||||
|
|
||||||
// ConstantVectors are easy; just pull out the individual constant
|
// ConstantVectors are easy; just pull out the individual constant
|
||||||
// element values
|
// element values
|
||||||
llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(vec);
|
llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(vec);
|
||||||
if (cv != NULL) {
|
if (cv != NULL) {
|
||||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
for (int i = 0; i < vectorLength; ++i)
|
||||||
scalarizedVector[i] = cv->getOperand(i);
|
scalarizedVector[i] = cv->getOperand(i);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// It's also easy if it's just a vector of all zeros
|
// It's also easy if it's just a vector of all zeros
|
||||||
llvm::ConstantAggregateZero *caz = llvm::dyn_cast<llvm::ConstantAggregateZero>(vec);
|
llvm::ConstantAggregateZero *caz =
|
||||||
if (caz) {
|
llvm::dyn_cast<llvm::ConstantAggregateZero>(vec);
|
||||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
if (caz != NULL) {
|
||||||
|
for (int i = 0; i < vectorLength; ++i)
|
||||||
scalarizedVector[i] = LLVMInt32(0);
|
scalarizedVector[i] = LLVMInt32(0);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -1557,13 +1589,16 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
|||||||
// scalar values we return from here are synthesized with scalar
|
// scalar values we return from here are synthesized with scalar
|
||||||
// versions of the original vector binary operator
|
// versions of the original vector binary operator
|
||||||
llvm::Instruction::BinaryOps opcode = bo->getOpcode();
|
llvm::Instruction::BinaryOps opcode = bo->getOpcode();
|
||||||
llvm::Value *v0[ISPC_MAX_NVEC], *v1[ISPC_MAX_NVEC];
|
llvm::Value **v0 =
|
||||||
|
(llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
|
||||||
|
llvm::Value **v1 =
|
||||||
|
(llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
|
||||||
|
|
||||||
if (!lScalarizeVector(bo->getOperand(0), v0) ||
|
if (!lScalarizeVector(bo->getOperand(0), v0, vectorLength) ||
|
||||||
!lScalarizeVector(bo->getOperand(1), v1))
|
!lScalarizeVector(bo->getOperand(1), v1, vectorLength))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
for (int i = 0; i < g->target.vectorWidth; ++i) {
|
for (int i = 0; i < vectorLength; ++i) {
|
||||||
scalarizedVector[i] =
|
scalarizedVector[i] =
|
||||||
llvm::BinaryOperator::Create(opcode, v0[i], v1[i], "flat_bop", bo);
|
llvm::BinaryOperator::Create(opcode, v0[i], v1[i], "flat_bop", bo);
|
||||||
lCopyMetadata(scalarizedVector[i], bo);
|
lCopyMetadata(scalarizedVector[i], bo);
|
||||||
@@ -1588,7 +1623,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
|||||||
// vaue in scalarizedVector[] based on the value being inserted.
|
// vaue in scalarizedVector[] based on the value being inserted.
|
||||||
while (ie != NULL) {
|
while (ie != NULL) {
|
||||||
uint64_t iOffset = lGetIntValue(ie->getOperand(2));
|
uint64_t iOffset = lGetIntValue(ie->getOperand(2));
|
||||||
assert((int)iOffset < g->target.vectorWidth);
|
assert((int)iOffset < vectorLength);
|
||||||
assert(scalarizedVector[iOffset] == NULL);
|
assert(scalarizedVector[iOffset] == NULL);
|
||||||
|
|
||||||
scalarizedVector[iOffset] = ie->getOperand(1);
|
scalarizedVector[iOffset] = ie->getOperand(1);
|
||||||
@@ -1602,15 +1637,17 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
|||||||
}
|
}
|
||||||
|
|
||||||
llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(vec);
|
llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(vec);
|
||||||
if (ci) {
|
if (ci != NULL) {
|
||||||
// Casts are similar to BinaryOperators in that we attempt to
|
// Casts are similar to BinaryOperators in that we attempt to
|
||||||
// scalarize the vector being cast and if successful, we apply
|
// scalarize the vector being cast and if successful, we apply
|
||||||
// equivalent scalar cast operators to each of the values in the
|
// equivalent scalar cast operators to each of the values in the
|
||||||
// scalarized vector.
|
// scalarized vector.
|
||||||
llvm::Instruction::CastOps op = ci->getOpcode();
|
llvm::Instruction::CastOps op = ci->getOpcode();
|
||||||
|
|
||||||
llvm::Value *scalarizedTarget[ISPC_MAX_NVEC];
|
llvm::Value **scalarizedTarget =
|
||||||
if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget))
|
(llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
|
||||||
|
if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget,
|
||||||
|
vectorLength))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
LLVM_TYPE_CONST llvm::Type *destType = ci->getDestTy();
|
LLVM_TYPE_CONST llvm::Type *destType = ci->getDestTy();
|
||||||
@@ -1619,7 +1656,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
|||||||
assert(vectorDestType != NULL);
|
assert(vectorDestType != NULL);
|
||||||
LLVM_TYPE_CONST llvm::Type *elementType = vectorDestType->getElementType();
|
LLVM_TYPE_CONST llvm::Type *elementType = vectorDestType->getElementType();
|
||||||
|
|
||||||
for (int i = 0; i < g->target.vectorWidth; ++i) {
|
for (int i = 0; i < vectorLength; ++i) {
|
||||||
scalarizedVector[i] =
|
scalarizedVector[i] =
|
||||||
llvm::CastInst::Create(op, scalarizedTarget[i], elementType,
|
llvm::CastInst::Create(op, scalarizedTarget[i], elementType,
|
||||||
"cast", ci);
|
"cast", ci);
|
||||||
@@ -1629,16 +1666,11 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
|||||||
}
|
}
|
||||||
|
|
||||||
llvm::ShuffleVectorInst *svi = llvm::dyn_cast<llvm::ShuffleVectorInst>(vec);
|
llvm::ShuffleVectorInst *svi = llvm::dyn_cast<llvm::ShuffleVectorInst>(vec);
|
||||||
if (svi) {
|
if (svi != NULL) {
|
||||||
// Note that the code for shufflevector instructions is untested.
|
|
||||||
// (We haven't yet had a case where it needs to run). Therefore,
|
|
||||||
// an assert at the bottom of this routien will hit the first time
|
|
||||||
// it runs as a reminder that this needs to be tested further.
|
|
||||||
|
|
||||||
LLVM_TYPE_CONST llvm::VectorType *svInstType =
|
LLVM_TYPE_CONST llvm::VectorType *svInstType =
|
||||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(svi->getType());
|
llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(svi->getType());
|
||||||
assert(svInstType != NULL);
|
assert(svInstType != NULL);
|
||||||
assert((int)svInstType->getNumElements() == g->target.vectorWidth);
|
assert((int)svInstType->getNumElements() == vectorLength);
|
||||||
|
|
||||||
// Scalarize the two vectors being shuffled. First figure out how
|
// Scalarize the two vectors being shuffled. First figure out how
|
||||||
// big they are.
|
// big they are.
|
||||||
@@ -1653,27 +1685,21 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
|||||||
int n0 = vectorType0->getNumElements();
|
int n0 = vectorType0->getNumElements();
|
||||||
int n1 = vectorType1->getNumElements();
|
int n1 = vectorType1->getNumElements();
|
||||||
|
|
||||||
// FIXME: It's actually totally legitimate for these two to have
|
|
||||||
// different sizes; the final result just needs to have the native
|
|
||||||
// vector width. To handle this, not only do we need to
|
|
||||||
// potentially dynamically allocate space for the arrays passed
|
|
||||||
// into lScalarizeVector, but we need to change the rest of its
|
|
||||||
// implementation to not key off g->target.vectorWidth everywhere
|
|
||||||
// to get the sizes of the arrays to iterate over, etc.
|
|
||||||
assert(n0 == g->target.vectorWidth && n1 == g->target.vectorWidth);
|
|
||||||
|
|
||||||
// Go ahead and scalarize the two input vectors now.
|
// Go ahead and scalarize the two input vectors now.
|
||||||
// FIXME: it's ok if some or all of the values of these two vectors
|
llvm::Value **v0 = (llvm::Value **)alloca(n0 * sizeof(llvm::Value *));
|
||||||
// have undef values, so long as we don't try to access undef
|
llvm::Value **v1 = (llvm::Value **)alloca(n1 * sizeof(llvm::Value *));
|
||||||
// values with the vector indices provided to the instruction.
|
|
||||||
// Should fix lScalarizeVector so that it doesn't return false in
|
if (!lScalarizeVector(svi->getOperand(0), v0, n0) ||
|
||||||
// this case and just leaves the elements of the arrays with undef
|
!lScalarizeVector(svi->getOperand(1), v1, n1))
|
||||||
// values as NULL.
|
|
||||||
llvm::Value *v0[ISPC_MAX_NVEC], *v1[ISPC_MAX_NVEC];
|
|
||||||
if (!lScalarizeVector(svi->getOperand(0), v0) ||
|
|
||||||
!lScalarizeVector(svi->getOperand(1), v1))
|
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
llvm::ConstantAggregateZero *caz =
|
||||||
|
llvm::dyn_cast<llvm::ConstantAggregateZero>(svi->getOperand(2));
|
||||||
|
if (caz != NULL) {
|
||||||
|
for (int i = 0; i < vectorLength; ++i)
|
||||||
|
scalarizedVector[i] = v0[0];
|
||||||
|
}
|
||||||
|
else {
|
||||||
llvm::ConstantVector *shuffleIndicesVector =
|
llvm::ConstantVector *shuffleIndicesVector =
|
||||||
llvm::dyn_cast<llvm::ConstantVector>(svi->getOperand(2));
|
llvm::dyn_cast<llvm::ConstantVector>(svi->getOperand(2));
|
||||||
// I think this has to be a ConstantVector. If this ever hits,
|
// I think this has to be a ConstantVector. If this ever hits,
|
||||||
@@ -1684,15 +1710,15 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
|||||||
// Get the integer indices for each element of the returned vector
|
// Get the integer indices for each element of the returned vector
|
||||||
llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> shuffleIndices;
|
llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> shuffleIndices;
|
||||||
shuffleIndicesVector->getVectorElements(shuffleIndices);
|
shuffleIndicesVector->getVectorElements(shuffleIndices);
|
||||||
assert((int)shuffleIndices.size() == g->target.vectorWidth);
|
assert((int)shuffleIndices.size() == vectorLength);
|
||||||
|
|
||||||
// And loop over the indices, setting the i'th element of the
|
// And loop over the indices, setting the i'th element of the
|
||||||
// result vector with the source vector element that corresponds to
|
// result vector with the source vector element that corresponds to
|
||||||
// the i'th shuffle index value.
|
// the i'th shuffle index value.
|
||||||
for (unsigned int i = 0; i < shuffleIndices.size(); ++i) {
|
for (unsigned int i = 0; i < shuffleIndices.size(); ++i) {
|
||||||
if (!llvm::isa<llvm::ConstantInt>(shuffleIndices[i]))
|
|
||||||
// I'm not sure when this case would ever happen, though..
|
// I'm not sure when this case would ever happen, though..
|
||||||
return false;
|
assert(llvm::isa<llvm::ConstantInt>(shuffleIndices[i]));
|
||||||
|
|
||||||
int offset = (int)lGetIntValue(shuffleIndices[i]);
|
int offset = (int)lGetIntValue(shuffleIndices[i]);
|
||||||
assert(offset >= 0 && offset < n0+n1);
|
assert(offset >= 0 && offset < n0+n1);
|
||||||
|
|
||||||
@@ -1704,7 +1730,45 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
|||||||
// vector
|
// vector
|
||||||
scalarizedVector[i] = v1[offset - n0];
|
scalarizedVector[i] = v1[offset - n0];
|
||||||
}
|
}
|
||||||
FATAL("the above code is untested so far; check now that it's actually running");
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm::LoadInst *li = llvm::dyn_cast<llvm::LoadInst>(vec);
|
||||||
|
if (li != NULL) {
|
||||||
|
llvm::Value *baseAddr = li->getOperand(0);
|
||||||
|
llvm::Value *baseInt = new llvm::PtrToIntInst(baseAddr, LLVMTypes::Int64Type,
|
||||||
|
"base2int", li);
|
||||||
|
lCopyMetadata(baseInt, li);
|
||||||
|
|
||||||
|
LLVM_TYPE_CONST llvm::PointerType *ptrType =
|
||||||
|
llvm::dyn_cast<llvm::PointerType>(baseAddr->getType());
|
||||||
|
assert(ptrType != NULL);
|
||||||
|
LLVM_TYPE_CONST llvm::VectorType *vecType =
|
||||||
|
llvm::dyn_cast<llvm::VectorType>(ptrType->getElementType());
|
||||||
|
assert(vecType != NULL);
|
||||||
|
LLVM_TYPE_CONST llvm::Type *elementType = vecType->getElementType();
|
||||||
|
uint64_t elementSize;
|
||||||
|
bool sizeKnown = lSizeOfIfKnown(elementType, &elementSize);
|
||||||
|
assert(sizeKnown == true);
|
||||||
|
|
||||||
|
LLVM_TYPE_CONST llvm::Type *eltPtrType = llvm::PointerType::get(elementType, 0);
|
||||||
|
|
||||||
|
for (int i = 0; i < vectorLength; ++i) {
|
||||||
|
llvm::Value *intPtrOffset =
|
||||||
|
llvm::BinaryOperator::Create(llvm::Instruction::Add, baseInt,
|
||||||
|
LLVMInt64(i * elementSize), "baseoffset",
|
||||||
|
li);
|
||||||
|
lCopyMetadata(intPtrOffset, li);
|
||||||
|
llvm::Value *scalarLoadPtr =
|
||||||
|
new llvm::IntToPtrInst(intPtrOffset, eltPtrType, "int2ptr", li);
|
||||||
|
lCopyMetadata(scalarLoadPtr, li);
|
||||||
|
|
||||||
|
llvm::Instruction *scalarLoad =
|
||||||
|
new llvm::LoadInst(scalarLoadPtr, "loadelt", li);
|
||||||
|
lCopyMetadata(scalarLoad, li);
|
||||||
|
scalarizedVector[i] = scalarLoad;
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2116,11 +2180,18 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|||||||
if (ce && ce->getOpcode() == llvm::Instruction::BitCast)
|
if (ce && ce->getOpcode() == llvm::Instruction::BitCast)
|
||||||
base = ce->getOperand(0);
|
base = ce->getOperand(0);
|
||||||
|
|
||||||
// Try to out the offsets; the i'th element of the offsetElements
|
// Try to find out the offsets; the i'th element of the
|
||||||
// array should be an i32 with the value of the offset for the i'th
|
// offsetElements array should be an i32 with the value of the
|
||||||
// vector lane. This may fail; if so, just give up.
|
// offset for the i'th vector lane. This may fail; if so, just
|
||||||
|
// give up.
|
||||||
|
llvm::Value *vecValue = callInst->getArgOperand(1);
|
||||||
|
LLVM_TYPE_CONST llvm::VectorType *vt =
|
||||||
|
llvm::dyn_cast<llvm::VectorType>(vecValue->getType());
|
||||||
|
assert(vt != NULL);
|
||||||
|
int vecLength = vt->getNumElements();
|
||||||
|
assert(vecLength == g->target.vectorWidth);
|
||||||
llvm::Value *offsetElements[ISPC_MAX_NVEC];
|
llvm::Value *offsetElements[ISPC_MAX_NVEC];
|
||||||
if (!lScalarizeVector(callInst->getArgOperand(1), offsetElements))
|
if (!lScalarizeVector(vecValue, offsetElements, vecLength))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3);
|
llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3);
|
||||||
@@ -2497,7 +2568,7 @@ llvm::RegisterPass<MakeInternalFuncsStaticPass>
|
|||||||
bool
|
bool
|
||||||
MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
|
MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
|
||||||
const char *names[] = {
|
const char *names[] = {
|
||||||
"__do_print",
|
"__do_print", "__fast_masked_vload", "__num_cores",
|
||||||
"__gather_base_offsets_i8", "__gather_base_offsets_i16",
|
"__gather_base_offsets_i8", "__gather_base_offsets_i16",
|
||||||
"__gather_base_offsets_i32", "__gather_base_offsets_i64",
|
"__gather_base_offsets_i32", "__gather_base_offsets_i64",
|
||||||
"__gather_elt_8", "__gather_elt_16",
|
"__gather_elt_8", "__gather_elt_16",
|
||||||
|
|||||||
57
parse.yy
57
parse.yy
@@ -165,7 +165,7 @@ static const char *lParamListTokens[] = {
|
|||||||
%token TOKEN_CBREAK TOKEN_CCONTINUE TOKEN_CRETURN TOKEN_SYNC TOKEN_PRINT
|
%token TOKEN_CBREAK TOKEN_CCONTINUE TOKEN_CRETURN TOKEN_SYNC TOKEN_PRINT
|
||||||
|
|
||||||
%type <expr> primary_expression postfix_expression
|
%type <expr> primary_expression postfix_expression
|
||||||
%type <expr> unary_expression cast_expression
|
%type <expr> unary_expression cast_expression launch_expression
|
||||||
%type <expr> multiplicative_expression additive_expression shift_expression
|
%type <expr> multiplicative_expression additive_expression shift_expression
|
||||||
%type <expr> relational_expression equality_expression and_expression
|
%type <expr> relational_expression equality_expression and_expression
|
||||||
%type <expr> exclusive_or_expression inclusive_or_expression
|
%type <expr> exclusive_or_expression inclusive_or_expression
|
||||||
@@ -177,6 +177,7 @@ static const char *lParamListTokens[] = {
|
|||||||
%type <stmt> statement labeled_statement compound_statement for_init_statement
|
%type <stmt> statement labeled_statement compound_statement for_init_statement
|
||||||
%type <stmt> expression_statement selection_statement iteration_statement
|
%type <stmt> expression_statement selection_statement iteration_statement
|
||||||
%type <stmt> jump_statement statement_list declaration_statement print_statement
|
%type <stmt> jump_statement statement_list declaration_statement print_statement
|
||||||
|
%type <stmt> sync_statement
|
||||||
|
|
||||||
%type <declaration> declaration parameter_declaration
|
%type <declaration> declaration parameter_declaration
|
||||||
%type <declarators> init_declarator_list
|
%type <declarators> init_declarator_list
|
||||||
@@ -221,7 +222,7 @@ primary_expression
|
|||||||
else {
|
else {
|
||||||
std::vector<Symbol *> *funs = m->symbolTable->LookupFunction(name);
|
std::vector<Symbol *> *funs = m->symbolTable->LookupFunction(name);
|
||||||
if (funs)
|
if (funs)
|
||||||
$$ = new FunctionSymbolExpr(funs, @1);
|
$$ = new FunctionSymbolExpr(name, funs, @1);
|
||||||
}
|
}
|
||||||
if ($$ == NULL) {
|
if ($$ == NULL) {
|
||||||
std::vector<std::string> alternates =
|
std::vector<std::string> alternates =
|
||||||
@@ -256,18 +257,32 @@ primary_expression
|
|||||||
| '(' expression ')' { $$ = $2; }
|
| '(' expression ')' { $$ = $2; }
|
||||||
;
|
;
|
||||||
|
|
||||||
|
launch_expression
|
||||||
|
: TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>'
|
||||||
|
{
|
||||||
|
ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @3);
|
||||||
|
$$ = new FunctionCallExpr($3, $5, @3, true, oneExpr);
|
||||||
|
}
|
||||||
|
| TOKEN_LAUNCH '<' postfix_expression '(' ')' '>'
|
||||||
|
{
|
||||||
|
ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @3);
|
||||||
|
$$ = new FunctionCallExpr($3, new ExprList(@3), @3, true, oneExpr);
|
||||||
|
}
|
||||||
|
| TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' argument_expression_list ')' '>'
|
||||||
|
{ $$ = new FunctionCallExpr($6, $8, @6, true, $3); }
|
||||||
|
| TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' ')' '>'
|
||||||
|
{ $$ = new FunctionCallExpr($6, new ExprList(@6), @6, true, $3); }
|
||||||
|
;
|
||||||
|
|
||||||
postfix_expression
|
postfix_expression
|
||||||
: primary_expression
|
: primary_expression
|
||||||
| postfix_expression '[' expression ']'
|
| postfix_expression '[' expression ']'
|
||||||
{ $$ = new IndexExpr($1, $3, @1); }
|
{ $$ = new IndexExpr($1, $3, @1); }
|
||||||
| postfix_expression '(' ')'
|
| postfix_expression '(' ')'
|
||||||
{ $$ = new FunctionCallExpr($1, new ExprList(@1), @1, false); }
|
{ $$ = new FunctionCallExpr($1, new ExprList(@1), @1); }
|
||||||
| postfix_expression '(' argument_expression_list ')'
|
| postfix_expression '(' argument_expression_list ')'
|
||||||
{ $$ = new FunctionCallExpr($1, $3, @1, false); }
|
{ $$ = new FunctionCallExpr($1, $3, @1); }
|
||||||
| TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>'
|
| launch_expression
|
||||||
{ $$ = new FunctionCallExpr($3, $5, @3, true); }
|
|
||||||
| TOKEN_LAUNCH '<' postfix_expression '(' ')' '>'
|
|
||||||
{ $$ = new FunctionCallExpr($3, new ExprList(@3), @3, true); }
|
|
||||||
| postfix_expression '.' TOKEN_IDENTIFIER
|
| postfix_expression '.' TOKEN_IDENTIFIER
|
||||||
{ $$ = MemberExpr::create($1, yytext, @1, @3); }
|
{ $$ = MemberExpr::create($1, yytext, @1, @3); }
|
||||||
/* | postfix_expression TOKEN_PTR_OP TOKEN_IDENTIFIER
|
/* | postfix_expression TOKEN_PTR_OP TOKEN_IDENTIFIER
|
||||||
@@ -436,8 +451,6 @@ assignment_expression
|
|||||||
|
|
||||||
expression
|
expression
|
||||||
: assignment_expression
|
: assignment_expression
|
||||||
| TOKEN_SYNC
|
|
||||||
{ $$ = new SyncExpr(@1); }
|
|
||||||
| expression ',' assignment_expression
|
| expression ',' assignment_expression
|
||||||
{ $$ = new BinaryExpr(BinaryExpr::Comma, $1, $3, @2); }
|
{ $$ = new BinaryExpr(BinaryExpr::Comma, $1, $3, @2); }
|
||||||
;
|
;
|
||||||
@@ -928,9 +941,13 @@ parameter_list
|
|||||||
builtinTokens.push_back(*token);
|
builtinTokens.push_back(*token);
|
||||||
++token;
|
++token;
|
||||||
}
|
}
|
||||||
|
if (strlen(yytext) == 0)
|
||||||
|
Error(@1, "Syntax error--premature end of file.");
|
||||||
|
else {
|
||||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||||
std::string alts = lGetAlternates(alternates);
|
std::string alts = lGetAlternates(alternates);
|
||||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||||
|
}
|
||||||
$$ = NULL;
|
$$ = NULL;
|
||||||
}
|
}
|
||||||
;
|
;
|
||||||
@@ -1019,6 +1036,7 @@ statement
|
|||||||
| jump_statement
|
| jump_statement
|
||||||
| declaration_statement
|
| declaration_statement
|
||||||
| print_statement
|
| print_statement
|
||||||
|
| sync_statement
|
||||||
| error
|
| error
|
||||||
{
|
{
|
||||||
std::vector<std::string> builtinTokens;
|
std::vector<std::string> builtinTokens;
|
||||||
@@ -1027,9 +1045,13 @@ statement
|
|||||||
builtinTokens.push_back(*token);
|
builtinTokens.push_back(*token);
|
||||||
++token;
|
++token;
|
||||||
}
|
}
|
||||||
|
if (strlen(yytext) == 0)
|
||||||
|
Error(@1, "Syntax error--premature end of file.");
|
||||||
|
else {
|
||||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||||
std::string alts = lGetAlternates(alternates);
|
std::string alts = lGetAlternates(alternates);
|
||||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||||
|
}
|
||||||
$$ = NULL;
|
$$ = NULL;
|
||||||
}
|
}
|
||||||
;
|
;
|
||||||
@@ -1155,6 +1177,11 @@ jump_statement
|
|||||||
{ $$ = new ReturnStmt($2, true, @1); }
|
{ $$ = new ReturnStmt($2, true, @1); }
|
||||||
;
|
;
|
||||||
|
|
||||||
|
sync_statement
|
||||||
|
: TOKEN_SYNC
|
||||||
|
{ $$ = new ExprStmt(new SyncExpr(@1), @1); }
|
||||||
|
;
|
||||||
|
|
||||||
print_statement
|
print_statement
|
||||||
: TOKEN_PRINT '(' string_constant ')'
|
: TOKEN_PRINT '(' string_constant ')'
|
||||||
{
|
{
|
||||||
@@ -1177,10 +1204,14 @@ translation_unit
|
|||||||
builtinTokens.push_back(*token);
|
builtinTokens.push_back(*token);
|
||||||
++token;
|
++token;
|
||||||
}
|
}
|
||||||
|
if (strlen(yytext) == 0)
|
||||||
|
Error(@1, "Syntax error--premature end of file.");
|
||||||
|
else {
|
||||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||||
std::string alts = lGetAlternates(alternates);
|
std::string alts = lGetAlternates(alternates);
|
||||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
;
|
;
|
||||||
|
|
||||||
external_declaration
|
external_declaration
|
||||||
@@ -1266,6 +1297,12 @@ static void lAddThreadIndexCountToSymbolTable(SourcePos pos) {
|
|||||||
|
|
||||||
Symbol *threadCountSym = new Symbol("threadCount", pos, AtomicType::UniformConstUInt32);
|
Symbol *threadCountSym = new Symbol("threadCount", pos, AtomicType::UniformConstUInt32);
|
||||||
m->symbolTable->AddVariable(threadCountSym);
|
m->symbolTable->AddVariable(threadCountSym);
|
||||||
|
|
||||||
|
Symbol *taskIndexSym = new Symbol("taskIndex", pos, AtomicType::UniformConstUInt32);
|
||||||
|
m->symbolTable->AddVariable(taskIndexSym);
|
||||||
|
|
||||||
|
Symbol *taskCountSym = new Symbol("taskCount", pos, AtomicType::UniformConstUInt32);
|
||||||
|
m->symbolTable->AddVariable(taskCountSym);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
218
run_tests.py
Executable file
218
run_tests.py
Executable file
@@ -0,0 +1,218 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
|
||||||
|
# test-running driver for ispc
|
||||||
|
|
||||||
|
# TODO: windows support (mostly should be calling CL.exe rather than gcc
|
||||||
|
# for static linking?)
|
||||||
|
|
||||||
|
from optparse import OptionParser
|
||||||
|
import multiprocessing
|
||||||
|
from ctypes import c_int
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import glob
|
||||||
|
import re
|
||||||
|
import signal
|
||||||
|
import random
|
||||||
|
import string
|
||||||
|
import mutex
|
||||||
|
import subprocess
|
||||||
|
import platform
|
||||||
|
|
||||||
|
parser = OptionParser()
|
||||||
|
parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests",
|
||||||
|
default=False, action="store_true")
|
||||||
|
parser.add_option("-s", "--static-exe", dest="static_exe",
|
||||||
|
help="Create and run a regular executable for each test (rather than using the LLVM JIT).",
|
||||||
|
default=False, action="store_true")
|
||||||
|
parser.add_option('-t', '--target', dest='target',
|
||||||
|
help='Set compilation target (sse2, sse4, sse4x2, avx, avx-x2)',
|
||||||
|
default="sse4")
|
||||||
|
parser.add_option('-a', '--arch', dest='arch',
|
||||||
|
help='Set architecture (x86, x86-64)',
|
||||||
|
default="x86-64")
|
||||||
|
parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization',
|
||||||
|
default=False, action="store_true")
|
||||||
|
|
||||||
|
(options, args) = parser.parse_args()
|
||||||
|
|
||||||
|
# if no specific test files are specified, run all of the tests in tests/
|
||||||
|
# and failing_tests/
|
||||||
|
if len(args) == 0:
|
||||||
|
files = glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc")
|
||||||
|
else:
|
||||||
|
files = args
|
||||||
|
|
||||||
|
# randomly shuffle the tests if asked to do so
|
||||||
|
if (options.random):
|
||||||
|
random.seed()
|
||||||
|
random.shuffle(files)
|
||||||
|
|
||||||
|
# counter
|
||||||
|
total_tests = 0
|
||||||
|
finished_tests_counter = multiprocessing.Value(c_int)
|
||||||
|
|
||||||
|
# We'd like to use the Lock class from the multiprocessing package to
|
||||||
|
# serialize accesses to finished_tests_counter. Unfortunately, the version of
|
||||||
|
# python that ships with OSX 10.5 has this bug:
|
||||||
|
# http://bugs.python.org/issue5261. Therefore, we use the (deprecated but
|
||||||
|
# still available) mutex class.
|
||||||
|
#finished_tests_counter_lock = multiprocessing.Lock()
|
||||||
|
finished_tests_mutex = mutex.mutex()
|
||||||
|
|
||||||
|
# utility routine to print an update on the number of tests that have been
|
||||||
|
# finished. Should be called with the mutex (or lock) held..
|
||||||
|
def update_progress(fn):
|
||||||
|
finished_tests_counter.value = finished_tests_counter.value + 1
|
||||||
|
progress_str = " Done %d / %d [%s]" % (finished_tests_counter.value, total_tests, fn)
|
||||||
|
# spaces to clear out detrius from previous printing...
|
||||||
|
for x in range(30):
|
||||||
|
progress_str += ' '
|
||||||
|
progress_str += '\r'
|
||||||
|
sys.stdout.write(progress_str)
|
||||||
|
sys.stdout.flush()
|
||||||
|
finished_tests_mutex.unlock()
|
||||||
|
|
||||||
|
fnull = open(os.devnull, 'w')
|
||||||
|
|
||||||
|
# run the commands in cmd_list
|
||||||
|
def run_cmds(cmd_list, filename, expect_failure):
|
||||||
|
for cmd in cmd_list:
|
||||||
|
if expect_failure:
|
||||||
|
failed = (subprocess.call(cmd, shell = True, stdout = fnull, stderr = fnull) != 0)
|
||||||
|
else:
|
||||||
|
failed = (os.system(cmd) != 0)
|
||||||
|
if failed:
|
||||||
|
break
|
||||||
|
|
||||||
|
surprise = ((expect_failure and not failed) or (not expect_failure and failed))
|
||||||
|
if surprise == True:
|
||||||
|
print "Test %s %s " % \
|
||||||
|
(filename, "unexpectedly passed" if expect_failure else "failed")
|
||||||
|
return surprise
|
||||||
|
|
||||||
|
|
||||||
|
# pull tests to run from the given queue and run them. Multiple copies of
|
||||||
|
# this function will be running in parallel across all of the CPU cores of
|
||||||
|
# the system.
|
||||||
|
def run_tasks_from_queue(queue):
    # Worker loop: repeatedly take a test filename from 'queue', build and
    # run it, and report progress.  When the 'STOP' sentinel is received the
    # process exits, using the number of surprising results as its exit code.
    error_count = 0
    while True:
        filename = queue.get()
        if (filename == 'STOP'):
            sys.exit(error_count)

        # do we expect this test to fail?
        should_fail = (filename.find("failing_") != -1)

        if options.static_exe == True:
            # if the user wants us to build a static executable to run for
            # this test, we need to figure out the signature of the test
            # function that this test has.
            match = _find_test_signature(filename)
            if match == -1:
                print("Fatal error: unable to find function signature in test %s" % filename)
                error_count += 1
            else:
                obj_name = "%s.o" % filename
                exe_name = "%s.run" % filename
                ispc_cmd = "ispc --woff %s -o %s --arch=%s --target=%s" % \
                    (filename, obj_name, options.arch, options.target)
                if options.no_opt:
                    ispc_cmd += " -O0"
                if options.arch == 'x86':
                    gcc_arch = '-m32'
                else:
                    gcc_arch = '-m64'
                gcc_cmd = "g++ %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \
                    (gcc_arch, match, obj_name, exe_name)
                if platform.system() == 'Darwin':
                    gcc_cmd += ' -Wl,-no_pie'
                if should_fail:
                    gcc_cmd += " -DEXPECT_FAILURE"

                # compile the ispc code, make the executable, and run it...
                error_count += run_cmds([ispc_cmd, gcc_cmd, exe_name], filename, should_fail)

                # clean up after running the test.  Each product is removed
                # independently: if the build stopped early the executable
                # does not exist, and that must not prevent the object file
                # from being deleted.
                _remove_quietly(exe_name)
                _remove_quietly(obj_name)
        else:
            # otherwise we'll use ispc_test + the LLVM JIT to run the test
            bitcode_file = "%s.bc" % filename
            compile_cmd = "ispc --woff --emit-llvm %s --target=%s -o %s" % \
                (filename, options.target, bitcode_file)
            if options.no_opt:
                compile_cmd += " -O0"
            test_cmd = "ispc_test %s" % bitcode_file

            error_count += run_cmds([compile_cmd, test_cmd], filename, should_fail)

            _remove_quietly(bitcode_file)

        # If not for http://bugs.python.org/issue5261 on OSX, we'd like to do this:
        #with finished_tests_counter_lock:
            #update_progress(filename)
        # but instead we do this...
        finished_tests_mutex.lock(update_progress, filename)


# Scan the test source 'filename' for 'export' lines and return the TEST_SIG
# value of the test-function signature found there, or -1 if none of the
# known signatures appears.  If several 'export' lines match, the last one
# wins.
def _find_test_signature(filename):
    sig2def = { "f_v(" : 0, "f_f(" : 1, "f_fu(" : 2, "f_fi(" : 3,
                "f_du(" : 4, "f_duf(" : 5, "f_di(" : 6 }
    match = -1
    # 'with' closes the file even if reading it raises.
    with open(filename, 'r') as src:
        for line in src:
            # look for lines with 'export'...
            if line.find("export") == -1:
                continue
            # one of them should have a function with one of the
            # declarations in sig2def
            for pattern, ident in sig2def.items():
                if line.find(pattern) != -1:
                    match = ident
                    break
    return match


# Best-effort removal of a temporary build product: a file that is missing or
# cannot be removed is ignored, but unrelated exceptions (KeyboardInterrupt,
# SystemExit, ...) are no longer swallowed.
def _remove_quietly(path):
    try:
        os.unlink(path)
    except OSError:
        pass
|
||||||
|
|
||||||
|
|
||||||
|
task_threads = []
|
||||||
|
|
||||||
|
def sigint(signum, frame):
    # SIGINT handler: terminate every worker process recorded in
    # task_threads, then exit with status 1.  Both arguments are supplied by
    # the signal machinery and are not used.
    for worker in task_threads:
        worker.terminate()
    sys.exit(1)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    nthreads = multiprocessing.cpu_count()
    total_tests = len(files)
    print("Found %d CPUs. Running %d tests." % (nthreads, total_tests))

    # put each of the test filenames into a queue, followed by one 'STOP'
    # sentinel per worker so that every worker eventually sees one and exits
    q = multiprocessing.Queue()
    for fn in files:
        q.put(fn)
    for x in range(nthreads):
        q.put('STOP')

    # need to catch sigint so that we can terminate all of the tasks if
    # we're interrupted
    signal.signal(signal.SIGINT, sigint)

    # launch jobs to run tests
    for x in range(nthreads):
        t = multiprocessing.Process(target=run_tasks_from_queue, args=(q,))
        task_threads.append(t)
        t.start()

    # wait for them to all finish and then return the number that failed
    # (i.e. return 0 if all is ok)
    error_count = 0
    for t in task_threads:
        t.join()
        code = t.exitcode
        if code is None or code < 0:
            # A negative exitcode means the worker was killed by a signal.
            # Adding it as-is would subtract from the total and could hide
            # real failures, so count the dead worker as one failure.
            error_count += 1
        else:
            error_count += code
    print("")
    if error_count > 0:
        print("%d / %d tests FAILED!" % (error_count, total_tests))
    # Exit statuses are truncated to 8 bits on POSIX, so e.g. 256 failures
    # would be reported to the shell as success; clamp to the largest value
    # that survives.
    sys.exit(min(error_count, 255))
|
||||||
166
stdlib.ispc
166
stdlib.ispc
@@ -369,7 +369,7 @@ static inline uniform float reduce_min(float v) {
|
|||||||
static inline uniform float reduce_max(float v) {
|
static inline uniform float reduce_max(float v) {
|
||||||
// For the lanes where the mask is off, replace the given value with
|
// For the lanes where the mask is off, replace the given value with
|
||||||
// negative infinity, so that it doesn't affect the result.
|
// negative infinity, so that it doesn't affect the result.
|
||||||
const uniform int iflt_neg_max = 0xff800000; // -infinity
|
const int iflt_neg_max = 0xff800000; // -infinity
|
||||||
// Must use __floatbits_varying_int32, not floatbits(), since with the
|
// Must use __floatbits_varying_int32, not floatbits(), since with the
|
||||||
// latter the current mask enters into the returned result...
|
// latter the current mask enters into the returned result...
|
||||||
return __reduce_max_float(__mask ? v : __floatbits_varying_int32(iflt_neg_max));
|
return __reduce_max_float(__mask ? v : __floatbits_varying_int32(iflt_neg_max));
|
||||||
@@ -427,7 +427,7 @@ static inline uniform double reduce_min(double v) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static inline uniform double reduce_max(double v) {
|
static inline uniform double reduce_max(double v) {
|
||||||
const uniform int64 iflt_neg_max = 0xfff0000000000000; // -infinity
|
const int64 iflt_neg_max = 0xfff0000000000000; // -infinity
|
||||||
// Must use __doublebits_varying_int64, not doublebits(), since with the
|
// Must use __doublebits_varying_int64, not doublebits(), since with the
|
||||||
// latter the current mask enters into the returned result...
|
// latter the current mask enters into the returned result...
|
||||||
return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max));
|
return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max));
|
||||||
@@ -471,21 +471,21 @@ static inline uniform unsigned int64 reduce_max(unsigned int64 v) {
|
|||||||
return __reduce_max_uint64(__mask ? v : 0);
|
return __reduce_max_uint64(__mask ? v : 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define REDUCE_EQUAL(TYPE, FUNCTYPE) \
|
#define REDUCE_EQUAL(TYPE, FUNCTYPE, MASKTYPE) \
|
||||||
static inline uniform bool reduce_equal(TYPE v) { \
|
static inline uniform bool reduce_equal(TYPE v) { \
|
||||||
uniform TYPE unusedValue; \
|
uniform TYPE unusedValue; \
|
||||||
return __reduce_equal_##FUNCTYPE(v, unusedValue, (int32)__mask); \
|
return __reduce_equal_##FUNCTYPE(v, unusedValue, (MASKTYPE)__mask); \
|
||||||
} \
|
} \
|
||||||
static inline uniform bool reduce_equal(TYPE v, reference uniform TYPE value) { \
|
static inline uniform bool reduce_equal(TYPE v, reference uniform TYPE value) { \
|
||||||
return __reduce_equal_##FUNCTYPE(v, value, (int32)__mask); \
|
return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask); \
|
||||||
}
|
}
|
||||||
|
|
||||||
REDUCE_EQUAL(int32, int32)
|
REDUCE_EQUAL(int32, int32, int32)
|
||||||
REDUCE_EQUAL(unsigned int32, int32)
|
REDUCE_EQUAL(unsigned int32, int32, unsigned int32)
|
||||||
REDUCE_EQUAL(float, float)
|
REDUCE_EQUAL(float, float, int32)
|
||||||
REDUCE_EQUAL(int64, int64)
|
REDUCE_EQUAL(int64, int64, int32)
|
||||||
REDUCE_EQUAL(unsigned int64, int64)
|
REDUCE_EQUAL(unsigned int64, int64, unsigned int32)
|
||||||
REDUCE_EQUAL(double, double)
|
REDUCE_EQUAL(double, double, int32)
|
||||||
|
|
||||||
static int32 exclusive_scan_add(int32 v) {
|
static int32 exclusive_scan_add(int32 v) {
|
||||||
return __exclusive_scan_add_i32(v, (int32)__mask);
|
return __exclusive_scan_add_i32(v, (int32)__mask);
|
||||||
@@ -549,23 +549,32 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) {
|
|||||||
static inline uniform int
|
static inline uniform int
|
||||||
packed_load_active(uniform unsigned int a[], uniform int start,
|
packed_load_active(uniform unsigned int a[], uniform int start,
|
||||||
reference unsigned int vals) {
|
reference unsigned int vals) {
|
||||||
return __packed_load_active(a, start, vals, __mask);
|
return __packed_load_active(a, (unsigned int)start, vals,
|
||||||
|
(unsigned int32)__mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline uniform int
|
static inline uniform int
|
||||||
packed_store_active(uniform unsigned int a[], uniform int start,
|
packed_store_active(uniform unsigned int a[], uniform int start,
|
||||||
unsigned int vals) {
|
unsigned int vals) {
|
||||||
return __packed_store_active(a, start, vals, __mask);
|
return __packed_store_active(a, (unsigned int)start, vals,
|
||||||
|
(unsigned int32)__mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline uniform int packed_load_active(uniform int a[], uniform int start,
|
static inline uniform int packed_load_active(uniform int a[], uniform int start,
|
||||||
reference int vals) {
|
reference int vals) {
|
||||||
return __packed_load_active(a, start, vals, __mask);
|
return __packed_load_active(a, start, vals, (int32)__mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline uniform int packed_store_active(uniform int a[], uniform int start,
|
static inline uniform int packed_store_active(uniform int a[], uniform int start,
|
||||||
int vals) {
|
int vals) {
|
||||||
return __packed_store_active(a, start, vals, __mask);
|
return __packed_store_active(a, start, vals, (int32)__mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// System information
|
||||||
|
|
||||||
|
static inline int num_cores() {
|
||||||
|
return __num_cores();
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
@@ -575,75 +584,108 @@ static inline void memory_barrier() {
|
|||||||
__memory_barrier();
|
__memory_barrier();
|
||||||
}
|
}
|
||||||
|
|
||||||
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB) \
|
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
|
||||||
static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
|
static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
|
||||||
memory_barrier(); \
|
memory_barrier(); \
|
||||||
TA ret = __atomic_##OPB##_##TB##_global(ref, value, __mask); \
|
TA ret = __atomic_##OPB##_##TB##_global(ref, value, (MASKTYPE)__mask); \
|
||||||
|
memory_barrier(); \
|
||||||
|
return ret; \
|
||||||
|
} \
|
||||||
|
static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \
|
||||||
|
uniform TA value) { \
|
||||||
|
memory_barrier(); \
|
||||||
|
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, (MASKTYPE)__mask); \
|
||||||
memory_barrier(); \
|
memory_barrier(); \
|
||||||
return ret; \
|
return ret; \
|
||||||
}
|
}
|
||||||
|
|
||||||
DEFINE_ATOMIC_OP(int32,int32,add,add)
|
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
|
||||||
DEFINE_ATOMIC_OP(int32,int32,subtract,sub)
|
static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
|
||||||
DEFINE_ATOMIC_OP(int32,int32,min,min)
|
uniform TA oneval = reduce_##OPA(value); \
|
||||||
DEFINE_ATOMIC_OP(int32,int32,max,max)
|
TA ret; \
|
||||||
DEFINE_ATOMIC_OP(int32,int32,and,and)
|
if (lanemask() != 0) { \
|
||||||
DEFINE_ATOMIC_OP(int32,int32,or,or)
|
memory_barrier(); \
|
||||||
DEFINE_ATOMIC_OP(int32,int32,xor,xor)
|
ret = __atomic_##OPB##_uniform_##TB##_global(ref, oneval, (MASKTYPE)__mask); \
|
||||||
DEFINE_ATOMIC_OP(int32,int32,swap,swap)
|
memory_barrier(); \
|
||||||
|
} \
|
||||||
|
return ret; \
|
||||||
|
} \
|
||||||
|
static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \
|
||||||
|
uniform TA value) { \
|
||||||
|
memory_barrier(); \
|
||||||
|
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, (MASKTYPE)__mask); \
|
||||||
|
memory_barrier(); \
|
||||||
|
return ret; \
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFINE_ATOMIC_OP(int32,int32,add,add,int32)
|
||||||
|
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32)
|
||||||
|
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,int32)
|
||||||
|
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,int32)
|
||||||
|
DEFINE_ATOMIC_OP(int32,int32,and,and,int32)
|
||||||
|
DEFINE_ATOMIC_OP(int32,int32,or,or,int32)
|
||||||
|
DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32)
|
||||||
|
DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32)
|
||||||
|
|
||||||
// For everything but atomic min and max, we can use the same
|
// For everything but atomic min and max, we can use the same
|
||||||
// implementations for unsigned as for signed.
|
// implementations for unsigned as for signed.
|
||||||
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add)
|
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub)
|
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int32,uint32,min,umin)
|
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int32,uint32,max,umax)
|
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and)
|
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or)
|
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor)
|
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap)
|
DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,unsigned int32)
|
||||||
|
|
||||||
DEFINE_ATOMIC_OP(float,float,swap,swap)
|
DEFINE_ATOMIC_OP(float,float,swap,swap,int32)
|
||||||
|
|
||||||
DEFINE_ATOMIC_OP(int64,int64,add,add)
|
DEFINE_ATOMIC_OP(int64,int64,add,add,int32)
|
||||||
DEFINE_ATOMIC_OP(int64,int64,subtract,sub)
|
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int32)
|
||||||
DEFINE_ATOMIC_OP(int64,int64,min,min)
|
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,int32)
|
||||||
DEFINE_ATOMIC_OP(int64,int64,max,max)
|
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,int32)
|
||||||
DEFINE_ATOMIC_OP(int64,int64,and,and)
|
DEFINE_ATOMIC_OP(int64,int64,and,and,int32)
|
||||||
DEFINE_ATOMIC_OP(int64,int64,or,or)
|
DEFINE_ATOMIC_OP(int64,int64,or,or,int32)
|
||||||
DEFINE_ATOMIC_OP(int64,int64,xor,xor)
|
DEFINE_ATOMIC_OP(int64,int64,xor,xor,int32)
|
||||||
DEFINE_ATOMIC_OP(int64,int64,swap,swap)
|
DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32)
|
||||||
|
|
||||||
// For everything but atomic min and max, we can use the same
|
// For everything but atomic min and max, we can use the same
|
||||||
// implementations for unsigned as for signed.
|
// implementations for unsigned as for signed.
|
||||||
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add)
|
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub)
|
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int64,uint64,min,umin)
|
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int64,uint64,max,umax)
|
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and)
|
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or)
|
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor)
|
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,unsigned int32)
|
||||||
DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap)
|
DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,unsigned int32)
|
||||||
|
|
||||||
DEFINE_ATOMIC_OP(double,double,swap,swap)
|
DEFINE_ATOMIC_OP(double,double,swap,swap,int32)
|
||||||
|
|
||||||
#undef DEFINE_ATOMIC_OP
|
#undef DEFINE_ATOMIC_OP
|
||||||
|
|
||||||
#define ATOMIC_DECL_CMPXCHG(TA, TB) \
|
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
|
||||||
static inline TA atomic_compare_exchange_global( \
|
static inline TA atomic_compare_exchange_global( \
|
||||||
uniform reference TA ref, TA oldval, TA newval) { \
|
uniform reference TA ref, TA oldval, TA newval) { \
|
||||||
memory_barrier(); \
|
memory_barrier(); \
|
||||||
TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, __mask); \
|
TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, (MASKTYPE)__mask); \
|
||||||
|
memory_barrier(); \
|
||||||
|
return ret; \
|
||||||
|
} \
|
||||||
|
static inline uniform TA atomic_compare_exchange_global( \
|
||||||
|
uniform reference TA ref, uniform TA oldval, uniform TA newval) { \
|
||||||
|
memory_barrier(); \
|
||||||
|
uniform TA ret = __atomic_compare_exchange_uniform_##TB##_global(ref, oldval, newval, (MASKTYPE)__mask); \
|
||||||
memory_barrier(); \
|
memory_barrier(); \
|
||||||
return ret; \
|
return ret; \
|
||||||
}
|
}
|
||||||
|
|
||||||
ATOMIC_DECL_CMPXCHG(int32, int32)
|
ATOMIC_DECL_CMPXCHG(int32, int32, int32)
|
||||||
ATOMIC_DECL_CMPXCHG(unsigned int32, int32)
|
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, unsigned int32)
|
||||||
ATOMIC_DECL_CMPXCHG(float, float)
|
ATOMIC_DECL_CMPXCHG(float, float, int32)
|
||||||
ATOMIC_DECL_CMPXCHG(int64, int64)
|
ATOMIC_DECL_CMPXCHG(int64, int64, int32)
|
||||||
ATOMIC_DECL_CMPXCHG(unsigned int64, int64)
|
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, unsigned int32)
|
||||||
ATOMIC_DECL_CMPXCHG(double, double)
|
ATOMIC_DECL_CMPXCHG(double, double, int32)
|
||||||
|
|
||||||
#undef ATOMIC_DECL_CMPXCHG
|
#undef ATOMIC_DECL_CMPXCHG
|
||||||
|
|
||||||
@@ -2850,6 +2892,12 @@ static inline void seed_rng(reference uniform RNGState state, uniform unsigned i
|
|||||||
seed = __seed4(state, 0, seed);
|
seed = __seed4(state, 0, seed);
|
||||||
if (programCount == 8)
|
if (programCount == 8)
|
||||||
__seed4(state, 4, seed ^ 0xbeeff00d);
|
__seed4(state, 4, seed ^ 0xbeeff00d);
|
||||||
|
if (programCount == 16) {
|
||||||
|
__seed4(state, 4, seed ^ 0xbeeff00d);
|
||||||
|
__seed4(state, 8, ((seed & 0xffff) << 16) | (seed >> 16));
|
||||||
|
__seed4(state, 12, (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) |
|
||||||
|
((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void fastmath() {
|
static inline void fastmath() {
|
||||||
|
|||||||
433
stmt.cpp
433
stmt.cpp
@@ -107,6 +107,12 @@ ExprStmt::Print(int indent) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
ExprStmt::EstimateCost() const {
|
||||||
|
return expr ? expr->EstimateCost() : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// DeclStmt
|
// DeclStmt
|
||||||
|
|
||||||
@@ -399,12 +405,25 @@ DeclStmt::Print(int indent) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
DeclStmt::EstimateCost() const {
|
||||||
|
int cost = 0;
|
||||||
|
for (unsigned int i = 0; i < declaration->declarators.size(); ++i)
|
||||||
|
if (declaration->declarators[i]->initExpr)
|
||||||
|
cost += declaration->declarators[i]->initExpr->EstimateCost();
|
||||||
|
return cost;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// IfStmt
|
// IfStmt
|
||||||
|
|
||||||
IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool doUnif, SourcePos p)
|
IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool checkCoherence, SourcePos p)
|
||||||
: Stmt(p), test(t), trueStmts(ts), falseStmts(fs),
|
: Stmt(p), test(t), trueStmts(ts), falseStmts(fs),
|
||||||
doCoherentCheck(doUnif && !g->opt.disableCoherentControlFlow) {
|
doAllCheck(checkCoherence &&
|
||||||
|
!g->opt.disableCoherentControlFlow),
|
||||||
|
doAnyCheck(test->GetType() != NULL &&
|
||||||
|
test->GetType()->IsVaryingType()) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -436,23 +455,26 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {
|
|||||||
|
|
||||||
ctx->SetDebugPos(pos);
|
ctx->SetDebugPos(pos);
|
||||||
bool isUniform = testType->IsUniformType();
|
bool isUniform = testType->IsUniformType();
|
||||||
|
|
||||||
|
llvm::Value *testValue = test->GetValue(ctx);
|
||||||
|
if (testValue == NULL)
|
||||||
|
return;
|
||||||
|
|
||||||
if (isUniform) {
|
if (isUniform) {
|
||||||
ctx->StartUniformIf(ctx->GetMask());
|
ctx->StartUniformIf(ctx->GetMask());
|
||||||
if (doCoherentCheck)
|
if (doAllCheck)
|
||||||
Warning(test->pos, "Uniform condition supplied to cif statement.");
|
Warning(test->pos, "Uniform condition supplied to \"cif\" statement.");
|
||||||
|
|
||||||
// 'If' statements with uniform conditions are relatively
|
// 'If' statements with uniform conditions are relatively
|
||||||
// straightforward. We evaluate the condition and then jump to
|
// straightforward. We evaluate the condition and then jump to
|
||||||
// either the 'then' or 'else' clause depending on its value.
|
// either the 'then' or 'else' clause depending on its value.
|
||||||
llvm::Value *vtest = test->GetValue(ctx);
|
|
||||||
if (vtest != NULL) {
|
|
||||||
llvm::BasicBlock *bthen = ctx->CreateBasicBlock("if_then");
|
llvm::BasicBlock *bthen = ctx->CreateBasicBlock("if_then");
|
||||||
llvm::BasicBlock *belse = ctx->CreateBasicBlock("if_else");
|
llvm::BasicBlock *belse = ctx->CreateBasicBlock("if_else");
|
||||||
llvm::BasicBlock *bexit = ctx->CreateBasicBlock("if_exit");
|
llvm::BasicBlock *bexit = ctx->CreateBasicBlock("if_exit");
|
||||||
|
|
||||||
// Jump to the appropriate basic block based on the value of
|
// Jump to the appropriate basic block based on the value of
|
||||||
// the 'if' test
|
// the 'if' test
|
||||||
ctx->BranchInst(bthen, belse, vtest);
|
ctx->BranchInst(bthen, belse, testValue);
|
||||||
|
|
||||||
// Emit code for the 'true' case
|
// Emit code for the 'true' case
|
||||||
ctx->SetCurrentBasicBlock(bthen);
|
ctx->SetCurrentBasicBlock(bthen);
|
||||||
@@ -469,29 +491,10 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {
|
|||||||
// Set the active basic block to the newly-created exit block
|
// Set the active basic block to the newly-created exit block
|
||||||
// so that subsequent emitted code starts there.
|
// so that subsequent emitted code starts there.
|
||||||
ctx->SetCurrentBasicBlock(bexit);
|
ctx->SetCurrentBasicBlock(bexit);
|
||||||
}
|
|
||||||
ctx->EndIf();
|
ctx->EndIf();
|
||||||
}
|
}
|
||||||
else {
|
else
|
||||||
// Code for 'If' statemnts with 'varying' conditions can be
|
emitVaryingIf(ctx, testValue);
|
||||||
// generated in two ways; one takes some care to see if all of the
|
|
||||||
// active program instances want to follow only the 'true' or
|
|
||||||
// 'false' cases, and the other always runs both cases but sets the
|
|
||||||
// mask appropriately. The first case is handled by the
|
|
||||||
// IfStmt::emitCoherentTests() call, and the second is handled by
|
|
||||||
// IfStmt::emitMaskedTrueAndFalse().
|
|
||||||
llvm::Value *testValue = test->GetValue(ctx);
|
|
||||||
if (testValue) {
|
|
||||||
if (doCoherentCheck)
|
|
||||||
emitCoherentTests(ctx, testValue);
|
|
||||||
else {
|
|
||||||
llvm::Value *oldMask = ctx->GetMask();
|
|
||||||
ctx->StartVaryingIf(oldMask);
|
|
||||||
emitMaskedTrueAndFalse(ctx, oldMask, testValue);
|
|
||||||
ctx->EndIf();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -535,9 +538,17 @@ Stmt *IfStmt::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
IfStmt::EstimateCost() const {
|
||||||
|
return ((test ? test->EstimateCost() : 0) +
|
||||||
|
(trueStmts ? trueStmts->EstimateCost() : 0) +
|
||||||
|
(falseStmts ? falseStmts->EstimateCost() : 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
IfStmt::Print(int indent) const {
|
IfStmt::Print(int indent) const {
|
||||||
printf("%*cIf Stmt %s", indent, ' ', doCoherentCheck ? "DO COHERENT CHECK" : "");
|
printf("%*cIf Stmt %s", indent, ' ', doAllCheck ? "DO ALL CHECK" : "");
|
||||||
pos.Print();
|
pos.Print();
|
||||||
printf("\n%*cTest: ", indent+4, ' ');
|
printf("\n%*cTest: ", indent+4, ' ');
|
||||||
test->Print();
|
test->Print();
|
||||||
@@ -554,7 +565,7 @@ IfStmt::Print(int indent) const {
|
|||||||
|
|
||||||
|
|
||||||
/** Emit code to run both the true and false statements for the if test,
|
/** Emit code to run both the true and false statements for the if test,
|
||||||
with the mask set appropriately before runnign each one.
|
with the mask set appropriately before running each one.
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
|
IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
|
||||||
@@ -574,11 +585,185 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Similar to the Stmt variant of this function, this conservatively
|
||||||
|
checks to see if it's safe to run the code for the given Expr even if
|
||||||
|
the mask is 'all off'.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
lSafeToRunWithAllLanesOff(Expr *expr) {
|
||||||
|
if (expr == NULL)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
UnaryExpr *ue;
|
||||||
|
if ((ue = dynamic_cast<UnaryExpr *>(expr)) != NULL)
|
||||||
|
return lSafeToRunWithAllLanesOff(ue->expr);
|
||||||
|
|
||||||
|
BinaryExpr *be;
|
||||||
|
if ((be = dynamic_cast<BinaryExpr *>(expr)) != NULL)
|
||||||
|
return (lSafeToRunWithAllLanesOff(be->arg0) &&
|
||||||
|
lSafeToRunWithAllLanesOff(be->arg1));
|
||||||
|
|
||||||
|
AssignExpr *ae;
|
||||||
|
if ((ae = dynamic_cast<AssignExpr *>(expr)) != NULL)
|
||||||
|
return (lSafeToRunWithAllLanesOff(ae->lvalue) &&
|
||||||
|
lSafeToRunWithAllLanesOff(ae->rvalue));
|
||||||
|
|
||||||
|
SelectExpr *se;
|
||||||
|
if ((se = dynamic_cast<SelectExpr *>(expr)) != NULL)
|
||||||
|
return (lSafeToRunWithAllLanesOff(se->test) &&
|
||||||
|
lSafeToRunWithAllLanesOff(se->expr1) &&
|
||||||
|
lSafeToRunWithAllLanesOff(se->expr2));
|
||||||
|
|
||||||
|
ExprList *el;
|
||||||
|
if ((el = dynamic_cast<ExprList *>(expr)) != NULL) {
|
||||||
|
for (unsigned int i = 0; i < el->exprs.size(); ++i)
|
||||||
|
if (!lSafeToRunWithAllLanesOff(el->exprs[i]))
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
FunctionCallExpr *fce;
|
||||||
|
if ((fce = dynamic_cast<FunctionCallExpr *>(expr)) != NULL)
|
||||||
|
// FIXME: If we could somehow determine that the function being
|
||||||
|
// called was safe (and all of the args Exprs were safe, then it'd
|
||||||
|
// be nice to be able to return true here. (Consider a call to
|
||||||
|
// e.g. floatbits() in the stdlib.) Unfortunately for now we just
|
||||||
|
// have to be conservative.
|
||||||
|
return false;
|
||||||
|
|
||||||
|
IndexExpr *ie;
|
||||||
|
if ((ie = dynamic_cast<IndexExpr *>(expr)) != NULL) {
|
||||||
|
// If we can determine at compile time the size of the array/vector
|
||||||
|
// and if the indices are compile-time constants, then we may be
|
||||||
|
// able to safely run this under a predicated if statement..
|
||||||
|
if (ie->arrayOrVector == NULL)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const Type *type = ie->arrayOrVector->GetType();
|
||||||
|
ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
|
||||||
|
if (type == NULL || ce == NULL)
|
||||||
|
return false;
|
||||||
|
if (dynamic_cast<const ReferenceType *>(type) != NULL)
|
||||||
|
type = type->GetReferenceTarget();
|
||||||
|
|
||||||
|
const SequentialType *seqType =
|
||||||
|
dynamic_cast<const SequentialType *>(type);
|
||||||
|
assert(seqType != NULL);
|
||||||
|
int nElements = seqType->GetElementCount();
|
||||||
|
if (nElements == 0)
|
||||||
|
// Unsized array, so we can't be sure
|
||||||
|
return false;
|
||||||
|
|
||||||
|
int32_t indices[ISPC_MAX_NVEC];
|
||||||
|
int count = ce->AsInt32(indices);
|
||||||
|
for (int i = 0; i < count; ++i)
|
||||||
|
if (indices[i] < 0 || indices[i] >= nElements)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// All indices are in-bounds
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
MemberExpr *me;
|
||||||
|
if ((me = dynamic_cast<MemberExpr *>(expr)) != NULL)
|
||||||
|
return lSafeToRunWithAllLanesOff(me->expr);
|
||||||
|
|
||||||
|
if (dynamic_cast<ConstExpr *>(expr) != NULL)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
TypeCastExpr *tce;
|
||||||
|
if ((tce = dynamic_cast<TypeCastExpr *>(expr)) != NULL)
|
||||||
|
return lSafeToRunWithAllLanesOff(tce->expr);
|
||||||
|
|
||||||
|
ReferenceExpr *re;
|
||||||
|
if ((re = dynamic_cast<ReferenceExpr *>(expr)) != NULL)
|
||||||
|
return lSafeToRunWithAllLanesOff(re->expr);
|
||||||
|
|
||||||
|
DereferenceExpr *dre;
|
||||||
|
if ((dre = dynamic_cast<DereferenceExpr *>(expr)) != NULL)
|
||||||
|
return lSafeToRunWithAllLanesOff(dre->expr);
|
||||||
|
|
||||||
|
if (dynamic_cast<SymbolExpr *>(expr) != NULL ||
|
||||||
|
dynamic_cast<FunctionSymbolExpr *>(expr) != NULL ||
|
||||||
|
dynamic_cast<SyncExpr *>(expr) != NULL)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
FATAL("Unknown Expr type in lSafeToRunWithAllLanesOff()");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Given an arbitrary statement, this function conservatively tests to see
|
||||||
|
if it's safe to run the code for the statement even if the mask is all
|
||||||
|
off. Here we just need to determine which kind of statement we have
|
||||||
|
and recursively traverse it and/or the expressions inside of it.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
lSafeToRunWithAllLanesOff(Stmt *stmt) {
|
||||||
|
if (stmt == NULL)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
ExprStmt *es;
|
||||||
|
if ((es = dynamic_cast<ExprStmt *>(stmt)) != NULL)
|
||||||
|
return lSafeToRunWithAllLanesOff(es->expr);
|
||||||
|
|
||||||
|
DeclStmt *ds;
|
||||||
|
if ((ds = dynamic_cast<DeclStmt *>(stmt)) != NULL) {
|
||||||
|
for (unsigned int i = 0; i < ds->declaration->declarators.size(); ++i)
|
||||||
|
if (!lSafeToRunWithAllLanesOff(ds->declaration->declarators[i]->initExpr))
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
IfStmt *is;
|
||||||
|
if ((is = dynamic_cast<IfStmt *>(stmt)) != NULL)
|
||||||
|
return (lSafeToRunWithAllLanesOff(is->test) &&
|
||||||
|
lSafeToRunWithAllLanesOff(is->trueStmts) &&
|
||||||
|
lSafeToRunWithAllLanesOff(is->falseStmts));
|
||||||
|
|
||||||
|
DoStmt *dos;
|
||||||
|
if ((dos = dynamic_cast<DoStmt *>(stmt)) != NULL)
|
||||||
|
return (lSafeToRunWithAllLanesOff(dos->testExpr) &&
|
||||||
|
lSafeToRunWithAllLanesOff(dos->bodyStmts));
|
||||||
|
|
||||||
|
ForStmt *fs;
|
||||||
|
if ((fs = dynamic_cast<ForStmt *>(stmt)) != NULL)
|
||||||
|
return (lSafeToRunWithAllLanesOff(fs->init) &&
|
||||||
|
lSafeToRunWithAllLanesOff(fs->test) &&
|
||||||
|
lSafeToRunWithAllLanesOff(fs->step) &&
|
||||||
|
lSafeToRunWithAllLanesOff(fs->stmts));
|
||||||
|
|
||||||
|
if (dynamic_cast<BreakStmt *>(stmt) != NULL ||
|
||||||
|
dynamic_cast<ContinueStmt *>(stmt) != NULL)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
ReturnStmt *rs;
|
||||||
|
if ((rs = dynamic_cast<ReturnStmt *>(stmt)) != NULL)
|
||||||
|
return lSafeToRunWithAllLanesOff(rs->val);
|
||||||
|
|
||||||
|
StmtList *sl;
|
||||||
|
if ((sl = dynamic_cast<StmtList *>(stmt)) != NULL) {
|
||||||
|
const std::vector<Stmt *> &sls = sl->GetStatements();
|
||||||
|
for (unsigned int i = 0; i < sls.size(); ++i)
|
||||||
|
if (!lSafeToRunWithAllLanesOff(sls[i]))
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
PrintStmt *ps;
|
||||||
|
if ((ps = dynamic_cast<PrintStmt *>(stmt)) != NULL)
|
||||||
|
return lSafeToRunWithAllLanesOff(ps->values);
|
||||||
|
|
||||||
|
FATAL("Unexpected stmt type in lSafeToRunWithAllLanesOff()");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Emit code for an if test that checks the mask and the test values and
|
/** Emit code for an if test that checks the mask and the test values and
|
||||||
tries to be smart about jumping over code that doesn't need to be run.
|
tries to be smart about jumping over code that doesn't need to be run.
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
|
IfStmt::emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *ltest) const {
|
||||||
llvm::Value *oldMask = ctx->GetMask();
|
llvm::Value *oldMask = ctx->GetMask();
|
||||||
if (oldMask == LLVMMaskAllOn) {
|
if (oldMask == LLVMMaskAllOn) {
|
||||||
// We can tell that the mask is on statically at compile time; just
|
// We can tell that the mask is on statically at compile time; just
|
||||||
@@ -587,7 +772,7 @@ IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
|
|||||||
emitMaskAllOn(ctx, ltest, bDone);
|
emitMaskAllOn(ctx, ltest, bDone);
|
||||||
ctx->SetCurrentBasicBlock(bDone);
|
ctx->SetCurrentBasicBlock(bDone);
|
||||||
}
|
}
|
||||||
else {
|
else if (doAllCheck) {
|
||||||
// We can't tell if the mask going into the if is all on at the
|
// We can't tell if the mask going into the if is all on at the
|
||||||
// compile time. Emit code to check for this and then either run
|
// compile time. Emit code to check for this and then either run
|
||||||
// the code for the 'all on' or the 'mixed' case depending on the
|
// the code for the 'all on' or the 'mixed' case depending on the
|
||||||
@@ -619,6 +804,43 @@ IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
|
|||||||
// paths above jump to when they're done.
|
// paths above jump to when they're done.
|
||||||
ctx->SetCurrentBasicBlock(bDone);
|
ctx->SetCurrentBasicBlock(bDone);
|
||||||
}
|
}
|
||||||
|
else if (trueStmts != NULL || falseStmts != NULL) {
|
||||||
|
// If there is nothing that is potentially unsafe to run with all
|
||||||
|
// lanes off in the true and false statements and if the total
|
||||||
|
// complexity of those two is relatively simple, then we'll go
|
||||||
|
// ahead and emit straightline code that runs both sides, updating
|
||||||
|
// the mask accordingly. This is useful for efficiently compiling
|
||||||
|
// things like:
|
||||||
|
//
|
||||||
|
// if (foo) x = 0;
|
||||||
|
// else ++x;
|
||||||
|
//
|
||||||
|
// Where the overhead of checking if any of the program instances wants
|
||||||
|
// to run one side or the other is more than the actual computation.
|
||||||
|
// The lSafeToRunWithAllLanesOff() checks to make sure that we don't do this
|
||||||
|
// for potentially dangerous code like:
|
||||||
|
//
|
||||||
|
// if (index < count) array[index] = 0;
|
||||||
|
//
|
||||||
|
// where our use of blend for conditional assignments doesn't check
|
||||||
|
// for the 'all lanes' off case.
|
||||||
|
if (lSafeToRunWithAllLanesOff(trueStmts) &&
|
||||||
|
lSafeToRunWithAllLanesOff(falseStmts) &&
|
||||||
|
(((trueStmts ? trueStmts->EstimateCost() : 0) +
|
||||||
|
(falseStmts ? falseStmts->EstimateCost() : 0)) <
|
||||||
|
PREDICATE_SAFE_IF_STATEMENT_COST)) {
|
||||||
|
ctx->StartVaryingIf(oldMask);
|
||||||
|
emitMaskedTrueAndFalse(ctx, oldMask, ltest);
|
||||||
|
assert(ctx->GetCurrentBasicBlock());
|
||||||
|
ctx->EndIf();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
assert(doAnyCheck);
|
||||||
|
llvm::BasicBlock *bDone = ctx->CreateBasicBlock("if_done");
|
||||||
|
emitMaskMixed(ctx, oldMask, ltest, bDone);
|
||||||
|
ctx->SetCurrentBasicBlock(bDone);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -677,69 +899,50 @@ IfStmt::emitMaskAllOn(FunctionEmitContext *ctx, llvm::Value *ltest,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Emits code that checks to see if for all of the lanes where the mask is
|
|
||||||
on, the test has the value true.
|
|
||||||
*/
|
|
||||||
static llvm::Value *
|
|
||||||
lTestMatchesMask(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *mask) {
|
|
||||||
llvm::Value *testAndMask = ctx->BinaryOperator(llvm::Instruction::And, test,
|
|
||||||
mask, "test&mask");
|
|
||||||
return ctx->MasksAllEqual(testAndMask, mask);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/** Emit code for an 'if' test where the lane mask is known to be mixed
|
/** Emit code for an 'if' test where the lane mask is known to be mixed
|
||||||
on/off going into it.
|
on/off going into it.
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
|
IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
|
||||||
llvm::Value *ltest, llvm::BasicBlock *bDone) const {
|
llvm::Value *ltest, llvm::BasicBlock *bDone) const {
|
||||||
// First, see if, for all of the lanes where the mask is on, if the
|
ctx->StartVaryingIf(oldMask);
|
||||||
// value of the test is on. (i.e. (test&mask) == mask). In this case,
|
llvm::BasicBlock *bNext = ctx->CreateBasicBlock("safe_if_after_true");
|
||||||
// we only need to run the 'true' case code, since the lanes where the
|
if (trueStmts != NULL) {
|
||||||
// test was false aren't supposed to be running here anyway.
|
llvm::BasicBlock *bRunTrue = ctx->CreateBasicBlock("safe_if_run_true");
|
||||||
llvm::Value *testAllEqual = lTestMatchesMask(ctx, ltest, oldMask);
|
ctx->MaskAnd(oldMask, ltest);
|
||||||
llvm::BasicBlock *bTestAll = ctx->CreateBasicBlock("cif_mixed_test_all");
|
|
||||||
llvm::BasicBlock *bTestAnyCheck = ctx->CreateBasicBlock("cif_mixed_test_any_check");
|
|
||||||
ctx->BranchInst(bTestAll, bTestAnyCheck, testAllEqual);
|
|
||||||
|
|
||||||
// Emit code for the (test&mask)==mask case. Not only do we only need
|
// Do any of the program instances want to run the 'true'
|
||||||
// to emit code for the true statements, but we don't need to modify
|
// block? If not, jump ahead to bNext.
|
||||||
// the mask's value; it's already correct.
|
llvm::Value *maskAnyQ = ctx->Any(ctx->GetMask());
|
||||||
ctx->SetCurrentBasicBlock(bTestAll);
|
ctx->BranchInst(bRunTrue, bNext, maskAnyQ);
|
||||||
ctx->StartVaryingIf(ctx->GetMask());
|
|
||||||
lEmitIfStatements(ctx, trueStmts, "cif: all running lanes want just true stmts");
|
// Emit statements for true
|
||||||
|
ctx->SetCurrentBasicBlock(bRunTrue);
|
||||||
|
lEmitIfStatements(ctx, trueStmts, "if: expr mixed, true statements");
|
||||||
assert(ctx->GetCurrentBasicBlock());
|
assert(ctx->GetCurrentBasicBlock());
|
||||||
ctx->EndIf();
|
ctx->BranchInst(bNext);
|
||||||
ctx->BranchInst(bDone);
|
ctx->SetCurrentBasicBlock(bNext);
|
||||||
|
}
|
||||||
|
if (falseStmts != NULL) {
|
||||||
|
llvm::BasicBlock *bRunFalse = ctx->CreateBasicBlock("safe_if_run_false");
|
||||||
|
bNext = ctx->CreateBasicBlock("safe_if_after_false");
|
||||||
|
ctx->MaskAndNot(oldMask, ltest);
|
||||||
|
|
||||||
// Next, see if the active lanes only need to run the false case--i.e. if
|
// Similarly, check to see if any of the instances want to
|
||||||
// (~test & mask) == mask.
|
// run the 'false' block...
|
||||||
ctx->SetCurrentBasicBlock(bTestAnyCheck);
|
llvm::Value *maskAnyQ = ctx->Any(ctx->GetMask());
|
||||||
llvm::Value *notTest = ctx->BinaryOperator(llvm::Instruction::Xor, LLVMMaskAllOn,
|
ctx->BranchInst(bRunFalse, bNext, maskAnyQ);
|
||||||
ltest, "~test");
|
|
||||||
llvm::Value *notMatchesMask = lTestMatchesMask(ctx, notTest, oldMask);
|
|
||||||
llvm::BasicBlock *bTestAllNot = ctx->CreateBasicBlock("cif_mixed_test_none");
|
|
||||||
llvm::BasicBlock *bTestMixed = ctx->CreateBasicBlock("cif_mixed_test_mixed");
|
|
||||||
ctx->BranchInst(bTestAllNot, bTestMixed, notMatchesMask);
|
|
||||||
|
|
||||||
// Emit code for the (~test & mask) == mask case. We only need the
|
// Emit code for false
|
||||||
// 'false' statements and again don't need to modify the value of the
|
ctx->SetCurrentBasicBlock(bRunFalse);
|
||||||
// mask.
|
lEmitIfStatements(ctx, falseStmts, "if: expr mixed, false statements");
|
||||||
ctx->SetCurrentBasicBlock(bTestAllNot);
|
|
||||||
ctx->StartVaryingIf(ctx->GetMask());
|
|
||||||
lEmitIfStatements(ctx, falseStmts, "cif: all running lanes want just false stmts");
|
|
||||||
assert(ctx->GetCurrentBasicBlock());
|
assert(ctx->GetCurrentBasicBlock());
|
||||||
ctx->EndIf();
|
ctx->BranchInst(bNext);
|
||||||
|
ctx->SetCurrentBasicBlock(bNext);
|
||||||
|
}
|
||||||
ctx->BranchInst(bDone);
|
ctx->BranchInst(bDone);
|
||||||
|
ctx->SetCurrentBasicBlock(bDone);
|
||||||
// It's mixed; we need to run both the true and false cases and also do
|
|
||||||
// mask update stuff.
|
|
||||||
ctx->SetCurrentBasicBlock(bTestMixed);
|
|
||||||
ctx->StartVaryingIf(ctx->GetMask());
|
|
||||||
emitMaskedTrueAndFalse(ctx, oldMask, ltest);
|
|
||||||
ctx->EndIf();
|
ctx->EndIf();
|
||||||
ctx->BranchInst(bDone);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -955,6 +1158,13 @@ DoStmt::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
DoStmt::EstimateCost() const {
|
||||||
|
return ((testExpr ? testExpr->EstimateCost() : 0) +
|
||||||
|
(bodyStmts ? bodyStmts->EstimateCost() : 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
DoStmt::Print(int indent) const {
|
DoStmt::Print(int indent) const {
|
||||||
printf("%*cDo Stmt", indent, ' ');
|
printf("%*cDo Stmt", indent, ' ');
|
||||||
@@ -1162,6 +1372,20 @@ ForStmt::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
ForStmt::EstimateCost() const {
|
||||||
|
bool uniformTest = test ? test->GetType()->IsUniformType() :
|
||||||
|
(!g->opt.disableUniformControlFlow &&
|
||||||
|
!lHasVaryingBreakOrContinue(stmts));
|
||||||
|
|
||||||
|
return ((init ? init->EstimateCost() : 0) +
|
||||||
|
(test ? test->EstimateCost() : 0) +
|
||||||
|
(step ? step->EstimateCost() : 0) +
|
||||||
|
(stmts ? stmts->EstimateCost() : 0) +
|
||||||
|
(uniformTest ? COST_UNIFORM_LOOP : COST_VARYING_LOOP));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
ForStmt::Print(int indent) const {
|
ForStmt::Print(int indent) const {
|
||||||
printf("%*cFor Stmt", indent, ' ');
|
printf("%*cFor Stmt", indent, ' ');
|
||||||
@@ -1216,6 +1440,13 @@ BreakStmt::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
BreakStmt::EstimateCost() const {
|
||||||
|
return doCoherenceCheck ? COST_COHERENT_BREAK_CONTINE :
|
||||||
|
COST_REGULAR_BREAK_CONTINUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
BreakStmt::Print(int indent) const {
|
BreakStmt::Print(int indent) const {
|
||||||
printf("%*c%sBreak Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
|
printf("%*c%sBreak Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
|
||||||
@@ -1254,6 +1485,13 @@ ContinueStmt::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
ContinueStmt::EstimateCost() const {
|
||||||
|
return doCoherenceCheck ? COST_COHERENT_BREAK_CONTINE :
|
||||||
|
COST_REGULAR_BREAK_CONTINUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
ContinueStmt::Print(int indent) const {
|
ContinueStmt::Print(int indent) const {
|
||||||
printf("%*c%sContinue Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
|
printf("%*c%sContinue Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
|
||||||
@@ -1300,6 +1538,12 @@ ReturnStmt::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
ReturnStmt::EstimateCost() const {
|
||||||
|
return COST_RETURN + (val ? val->EstimateCost() : 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
ReturnStmt::Print(int indent) const {
|
ReturnStmt::Print(int indent) const {
|
||||||
printf("%*c%sReturn Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
|
printf("%*c%sReturn Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
|
||||||
@@ -1345,6 +1589,16 @@ StmtList::TypeCheck() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
StmtList::EstimateCost() const {
|
||||||
|
int cost = 0;
|
||||||
|
for (unsigned int i = 0; i < stmts.size(); ++i)
|
||||||
|
if (stmts[i])
|
||||||
|
cost += stmts[i]->EstimateCost();
|
||||||
|
return cost;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
StmtList::Print(int indent) const {
|
StmtList::Print(int indent) const {
|
||||||
printf("%*cStmt List", indent, ' ');
|
printf("%*cStmt List", indent, ' ');
|
||||||
@@ -1464,8 +1718,11 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
|
|||||||
llvm::Value *args[5];
|
llvm::Value *args[5];
|
||||||
std::string argTypes;
|
std::string argTypes;
|
||||||
|
|
||||||
if (values == NULL)
|
if (values == NULL) {
|
||||||
args[4] = NULL;
|
LLVM_TYPE_CONST llvm::Type *ptrPtrType =
|
||||||
|
llvm::PointerType::get(LLVMTypes::VoidPointerType, 0);
|
||||||
|
args[4] = llvm::Constant::getNullValue(ptrPtrType);
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
// Get the values passed to the print() statement evaluated and
|
// Get the values passed to the print() statement evaluated and
|
||||||
// stored in memory so that we set up the array of pointers to them
|
// stored in memory so that we set up the array of pointers to them
|
||||||
@@ -1542,3 +1799,11 @@ PrintStmt::TypeCheck() {
|
|||||||
values = values->TypeCheck();
|
values = values->TypeCheck();
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
PrintStmt::EstimateCost() const {
|
||||||
|
return COST_FUNCALL + (values ? values->EstimateCost() : 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
23
stmt.h
23
stmt.h
@@ -75,8 +75,8 @@ public:
|
|||||||
|
|
||||||
Stmt *Optimize();
|
Stmt *Optimize();
|
||||||
Stmt *TypeCheck();
|
Stmt *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
Expr *expr;
|
Expr *expr;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -92,8 +92,8 @@ public:
|
|||||||
|
|
||||||
Stmt *Optimize();
|
Stmt *Optimize();
|
||||||
Stmt *TypeCheck();
|
Stmt *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
Declaration *declaration;
|
Declaration *declaration;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -103,13 +103,14 @@ private:
|
|||||||
class IfStmt : public Stmt {
|
class IfStmt : public Stmt {
|
||||||
public:
|
public:
|
||||||
IfStmt(Expr *testExpr, Stmt *trueStmts, Stmt *falseStmts,
|
IfStmt(Expr *testExpr, Stmt *trueStmts, Stmt *falseStmts,
|
||||||
bool doCoherentCheck, SourcePos pos);
|
bool doAllCheck, SourcePos pos);
|
||||||
|
|
||||||
void EmitCode(FunctionEmitContext *ctx) const;
|
void EmitCode(FunctionEmitContext *ctx) const;
|
||||||
void Print(int indent) const;
|
void Print(int indent) const;
|
||||||
|
|
||||||
Stmt *Optimize();
|
Stmt *Optimize();
|
||||||
Stmt *TypeCheck();
|
Stmt *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
// @todo these are only public for lHasVaryingBreakOrContinue(); would
|
// @todo these are only public for lHasVaryingBreakOrContinue(); would
|
||||||
// be nice to clean that up...
|
// be nice to clean that up...
|
||||||
@@ -125,11 +126,12 @@ private:
|
|||||||
source and thus, if the emitted code should check to see if all
|
source and thus, if the emitted code should check to see if all
|
||||||
active program instances want to follow just one of the 'true' or
|
active program instances want to follow just one of the 'true' or
|
||||||
'false' blocks. */
|
'false' blocks. */
|
||||||
const bool doCoherentCheck;
|
const bool doAllCheck;
|
||||||
|
const bool doAnyCheck;
|
||||||
|
|
||||||
void emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
|
void emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
|
||||||
llvm::Value *test) const;
|
llvm::Value *test) const;
|
||||||
void emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *test) const;
|
void emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *test) const;
|
||||||
void emitMaskAllOn(FunctionEmitContext *ctx,
|
void emitMaskAllOn(FunctionEmitContext *ctx,
|
||||||
llvm::Value *test, llvm::BasicBlock *bDone) const;
|
llvm::Value *test, llvm::BasicBlock *bDone) const;
|
||||||
void emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
|
void emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
|
||||||
@@ -150,8 +152,8 @@ public:
|
|||||||
|
|
||||||
Stmt *Optimize();
|
Stmt *Optimize();
|
||||||
Stmt *TypeCheck();
|
Stmt *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
Expr *testExpr;
|
Expr *testExpr;
|
||||||
Stmt *bodyStmts;
|
Stmt *bodyStmts;
|
||||||
const bool doCoherentCheck;
|
const bool doCoherentCheck;
|
||||||
@@ -171,8 +173,8 @@ public:
|
|||||||
|
|
||||||
Stmt *Optimize();
|
Stmt *Optimize();
|
||||||
Stmt *TypeCheck();
|
Stmt *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
/** 'for' statment initializer; may be NULL, indicating no intitializer */
|
/** 'for' statment initializer; may be NULL, indicating no intitializer */
|
||||||
Stmt *init;
|
Stmt *init;
|
||||||
/** expression that returns a value indicating whether the loop should
|
/** expression that returns a value indicating whether the loop should
|
||||||
@@ -198,6 +200,7 @@ public:
|
|||||||
|
|
||||||
Stmt *Optimize();
|
Stmt *Optimize();
|
||||||
Stmt *TypeCheck();
|
Stmt *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/** This indicates whether the generated code will check to see if no
|
/** This indicates whether the generated code will check to see if no
|
||||||
@@ -219,6 +222,7 @@ public:
|
|||||||
|
|
||||||
Stmt *Optimize();
|
Stmt *Optimize();
|
||||||
Stmt *TypeCheck();
|
Stmt *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/** This indicates whether the generated code will check to see if no
|
/** This indicates whether the generated code will check to see if no
|
||||||
@@ -240,8 +244,8 @@ public:
|
|||||||
|
|
||||||
Stmt *Optimize();
|
Stmt *Optimize();
|
||||||
Stmt *TypeCheck();
|
Stmt *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
Expr *val;
|
Expr *val;
|
||||||
/** This indicates whether the generated code will check to see if no
|
/** This indicates whether the generated code will check to see if no
|
||||||
more program instances are currently running after the return, in
|
more program instances are currently running after the return, in
|
||||||
@@ -262,6 +266,7 @@ public:
|
|||||||
|
|
||||||
Stmt *Optimize();
|
Stmt *Optimize();
|
||||||
Stmt *TypeCheck();
|
Stmt *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
void Add(Stmt *s) { if (s) stmts.push_back(s); }
|
void Add(Stmt *s) { if (s) stmts.push_back(s); }
|
||||||
const std::vector<Stmt *> &GetStatements() { return stmts; }
|
const std::vector<Stmt *> &GetStatements() { return stmts; }
|
||||||
@@ -289,8 +294,8 @@ public:
|
|||||||
|
|
||||||
Stmt *Optimize();
|
Stmt *Optimize();
|
||||||
Stmt *TypeCheck();
|
Stmt *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
private:
|
|
||||||
/** Format string for the print() statement. */
|
/** Format string for the print() statement. */
|
||||||
const std::string format;
|
const std::string format;
|
||||||
/** This holds the arguments passed to the print() statement. If more
|
/** This holds the arguments passed to the print() statement. If more
|
||||||
|
|||||||
154
test_static.cpp
Normal file
154
test_static.cpp
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2010-2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if defined(_WIN32) || defined(_WIN64)
|
||||||
|
#define ISPC_IS_WINDOWS
|
||||||
|
#elif defined(__linux__)
|
||||||
|
#define ISPC_IS_LINUX
|
||||||
|
#elif defined(__APPLE__)
|
||||||
|
#define ISPC_IS_APPLE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#ifdef ISPC_IS_LINUX
|
||||||
|
#include <malloc.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
extern int width();
|
||||||
|
extern void f_v(float *result);
|
||||||
|
extern void f_f(float *result, float *a);
|
||||||
|
extern void f_fu(float *result, float *a, float b);
|
||||||
|
extern void f_fi(float *result, float *a, int *b);
|
||||||
|
extern void f_du(float *result, double *a, double b);
|
||||||
|
extern void f_duf(float *result, double *a, float b);
|
||||||
|
extern void f_di(float *result, double *a, int *b);
|
||||||
|
extern void result(float *val);
|
||||||
|
|
||||||
|
void ISPCLaunch(void **handlePtr, void *f, void *d, int);
|
||||||
|
void ISPCSync(void *handle);
|
||||||
|
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ISPCLaunch(void **handle, void *f, void *d, int count) {
|
||||||
|
*handle = (void *)0xdeadbeef;
|
||||||
|
typedef void (*TaskFuncType)(void *, int, int, int, int);
|
||||||
|
TaskFuncType func = (TaskFuncType)f;
|
||||||
|
for (int i = 0; i < count; ++i)
|
||||||
|
func(d, 0, 1, i, count);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ISPCSync(void *) {
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
|
||||||
|
*handle = (void *)0xdeadbeef;
|
||||||
|
// and now, we leak...
|
||||||
|
#ifdef ISPC_IS_WINDOWS
|
||||||
|
return _aligned_malloc(size, alignment);
|
||||||
|
#endif
|
||||||
|
#ifdef ISPC_IS_LINUX
|
||||||
|
return memalign(alignment, size);
|
||||||
|
#endif
|
||||||
|
#ifdef ISPC_IS_APPLE
|
||||||
|
void *mem = malloc(size + (alignment-1) + sizeof(void*));
|
||||||
|
char *amem = ((char*)mem) + sizeof(void*);
|
||||||
|
amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
|
||||||
|
(alignment - 1)));
|
||||||
|
((void**)amem)[-1] = mem;
|
||||||
|
return amem;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
int w = width();
|
||||||
|
assert(w <= 16);
|
||||||
|
|
||||||
|
float returned_result[16];
|
||||||
|
memset(returned_result, 0, 16*sizeof(float));
|
||||||
|
float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
|
||||||
|
double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
|
||||||
|
int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
|
||||||
|
int vint2[16] = { 5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
|
||||||
|
float b = 5.;
|
||||||
|
|
||||||
|
#if (TEST_SIG == 0)
|
||||||
|
f_v(returned_result);
|
||||||
|
#elif (TEST_SIG == 1)
|
||||||
|
f_f(returned_result, vfloat);
|
||||||
|
#elif (TEST_SIG == 2)
|
||||||
|
f_fu(returned_result, vfloat, b);
|
||||||
|
#elif (TEST_SIG == 3)
|
||||||
|
f_fi(returned_result, vfloat, vint);
|
||||||
|
#elif (TEST_SIG == 4)
|
||||||
|
f_du(returned_result, vdouble, 5.);
|
||||||
|
#elif (TEST_SIG == 5)
|
||||||
|
f_duf(returned_result, vdouble, 5.f);
|
||||||
|
#elif (TEST_SIG == 6)
|
||||||
|
f_di(returned_result, vdouble, vint2);
|
||||||
|
#else
|
||||||
|
#error "Unknown or unset TEST_SIG value"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
float expected_result[16];
|
||||||
|
memset(expected_result, 0, 16*sizeof(float));
|
||||||
|
result(expected_result);
|
||||||
|
|
||||||
|
int errors = 0;
|
||||||
|
for (int i = 0; i < w; ++i) {
|
||||||
|
if (returned_result[i] != expected_result[i]) {
|
||||||
|
#ifdef EXPECT_FAILURE
|
||||||
|
// bingo, failed
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
|
||||||
|
argv[0], i, returned_result[i], returned_result[i],
|
||||||
|
expected_result[i], expected_result[i]);
|
||||||
|
++errors;
|
||||||
|
#endif // EXPECT_FAILURE
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef EXPECT_FAILURE
|
||||||
|
// Don't expect to get here
|
||||||
|
return 0;
|
||||||
|
#else
|
||||||
|
return errors > 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
static float x[2][1];
|
static float x[1][2];
|
||||||
|
|
||||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export void result(uniform float RET[4]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[programIndex] = 0;
|
RET[programIndex] = 0;
|
||||||
RET[3] = 4;
|
RET[3] = 4;
|
||||||
RET[4] = 5;
|
RET[4] = 5;
|
||||||
|
|||||||
@@ -5,7 +5,8 @@ uniform unsigned int32 s = 0;
|
|||||||
|
|
||||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
float b = atomic_add_global(s, 1);
|
float delta = 1;
|
||||||
|
float b = atomic_add_global(s, delta);
|
||||||
RET[programIndex] = reduce_add(b);
|
RET[programIndex] = reduce_add(b);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
17
tests/atomics-10.ispc
Normal file
17
tests/atomics-10.ispc
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
uniform unsigned int32 s = 0;
|
||||||
|
|
||||||
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
|
float a = aFOO[programIndex];
|
||||||
|
float b = 0;
|
||||||
|
float delta = 1;
|
||||||
|
if (programIndex < 2)
|
||||||
|
b = atomic_add_global(s, delta);
|
||||||
|
RET[programIndex] = s;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
RET[programIndex] = 2;
|
||||||
|
}
|
||||||
20
tests/atomics-11.ispc
Normal file
20
tests/atomics-11.ispc
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
uniform unsigned int32 s = 0;
|
||||||
|
|
||||||
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
|
float a = aFOO[programIndex];
|
||||||
|
float b = 0;
|
||||||
|
if (programIndex & 1)
|
||||||
|
b = atomic_add_global(s, programIndex);
|
||||||
|
RET[programIndex] = s;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
uniform int sum = 0;
|
||||||
|
for (uniform int i = 0; i < programCount; ++i)
|
||||||
|
if (i & 1)
|
||||||
|
sum += i;
|
||||||
|
RET[programIndex] = sum;
|
||||||
|
}
|
||||||
20
tests/atomics-12.ispc
Normal file
20
tests/atomics-12.ispc
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
uniform unsigned int32 s = 0;
|
||||||
|
|
||||||
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
|
float a = aFOO[programIndex];
|
||||||
|
float b = 0;
|
||||||
|
if (programIndex & 1)
|
||||||
|
b = atomic_or_global(s, (1 << programIndex));
|
||||||
|
RET[programIndex] = s;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
uniform int sum = 0;
|
||||||
|
for (uniform int i = 0; i < programCount; ++i)
|
||||||
|
if (i & 1)
|
||||||
|
sum += (1 << i);
|
||||||
|
RET[programIndex] = sum;
|
||||||
|
}
|
||||||
16
tests/atomics-13.ispc
Normal file
16
tests/atomics-13.ispc
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
uniform unsigned int32 s = 0;
|
||||||
|
|
||||||
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
|
float a = aFOO[programIndex];
|
||||||
|
float b = 0;
|
||||||
|
if (programIndex & 1)
|
||||||
|
b = atomic_or_global(s, (1 << programIndex));
|
||||||
|
RET[programIndex] = popcnt(reduce_max((int32)b));
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
RET[programIndex] = (programCount/2) - 1;
|
||||||
|
}
|
||||||
20
tests/atomics-14.ispc
Normal file
20
tests/atomics-14.ispc
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
uniform unsigned int64 s = 0xffffffffff000000;
|
||||||
|
|
||||||
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
|
float a = aFOO[programIndex];
|
||||||
|
float b = 0;
|
||||||
|
if (programIndex & 1)
|
||||||
|
b = atomic_or_global(s, (1 << programIndex));
|
||||||
|
RET[programIndex] = (s>>20);
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
uniform int sum = 0;
|
||||||
|
for (uniform int i = 0; i < programCount; ++i)
|
||||||
|
if (i & 1)
|
||||||
|
sum += (1 << i);
|
||||||
|
RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20;
|
||||||
|
}
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user