Compare commits

104 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 172794ba5f | |
| | a4bb6b5520 | |
| | a552927a6a | |
| | 2d52c732f1 | |
| | 25676d5643 | |
| | 158bd6ef9e | |
| | 7f662de6e3 | |
| | 80ca02af58 | |
| | 8aea4a836d | |
| | 922dbdec06 | |
| | e230d2c9ca | |
| | d0674b1706 | |
| | 16be1d313e | |
| | 0932dcd98b | |
| | 43a619669f | |
| | 59036cdf5b | |
| | 98a2d69e72 | |
| | da0fd93315 | |
| | 165f90357f | |
| | 8ef3df57c5 | |
| | 96d40327d0 | |
| | bba7211654 | |
| | 2d573acd17 | |
| | 654cfb4b4b | |
| | 65a29ec316 | |
| | 6b0a6c0124 | |
| | 213c3a9666 | |
| | f0f876c3ec | |
| | 17e5c8b7c2 | |
| | 646db5aacb | |
| | a535aa586b | |
| | 6e8af5038b | |
| | 7058ca1aaf | |
| | ae6ee3ea46 | |
| | e156651190 | |
| | 092d288aef | |
| | 409bdc0dba | |
| | aef8c09019 | |
| | 729f522a01 | |
| | 96ad2265e7 | |
| | 5a53a43ed0 | |
| | be8e121b71 | |
| | f1aaf0115e | |
| | 6b5ee6ccc0 | |
| | a1d5ea69b9 | |
| | af70718eca | |
| | 8e5ea9c33c | |
| | 6e4c165c7e | |
| | 4d733af3c7 | |
| | b8dae5cb9a | |
| | 6ea213ad5d | |
| | 126e065601 | |
| | 5cc750ecee | |
| | 92106e866e | |
| | 6d3e44ead7 | |
| | f0d254b941 | |
| | 5bcc611409 | |
| | 24f47b300d | |
| | 5c810e620d | |
| | c6bc8fd64f | |
| | 3b3015162f | |
| | 46ccc251c8 | |
| | b0658549c5 | |
| | c14c3ceba6 | |
| | fac50ba454 | |
| | fe7717ab67 | |
| | a9540b7c18 | |
| | 28625eb1df | |
| | c6bbfe8b54 | |
| | 9b7eb88b0c | |
| | 6ed6961958 | |
| | d2d5858be1 | |
| | a2940d63b4 | |
| | 32764e7639 | |
| | bcae21dbca | |
| | eb22fa6173 | |
| | 5f7e61f9b5 | |
| | 28a68e3c1f | |
| | 6b153566f3 | |
| | 214fb3197a | |
| | b4068efcfb | |
| | 24216d841f | |
| | be45beb54b | |
| | cb58c78c1a | |
| | 86de910ecd | |
| | ce7978ae74 | |
| | 7aec7486f8 | |
| | b6d6ee6fc2 | |
| | cb74346d36 | |
| | 2709c354d7 | |
| | 36063bae79 | |
| | e6d6a82484 | |
| | f830e21cfa | |
| | ae2c24c3c1 | |
| | 6dfd74c74c | |
| | 7055888cb7 | |
| | 7854a71ea9 | |
| | b7519d1268 | |
| | f2758f0831 | |
| | ff76c2334e | |
| | 9b6bf5dabc | |
| | ab33afaea4 | |
| | fab5794faf | |
| | 3c3cd88692 | |
.gitignore (vendored, 2 changes)

@@ -4,3 +4,5 @@ depend
ispc
ispc_test
objs
docs/doxygen
docs/ispc.html
LICENSE.txt (27 changes)

@@ -114,3 +114,30 @@ CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.

---------------------------------------------------------------------------

ispc's code to convert to and from half-precision floats is based on James
Tursa's code, which is covered by the following license:

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

  * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
  * Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in
    the documentation and/or other materials provided with the distribution

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
Makefile (43 changes)

@@ -2,15 +2,20 @@
# ispc Makefile
#

ARCH = $(shell uname)
ARCH_OS = $(shell uname)
ARCH_TYPE = $(shell arch)

CLANG=clang
CLANG_LIBS = -lclangFrontend -lclangDriver \
    -lclangSerialization -lclangParse -lclangSema \
    -lclangAnalysis -lclangAST -lclangLex -lclangBasic

LLVM_LIBS=$(shell llvm-config --ldflags --libs) -lpthread -ldl
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
LLVM_VERSION_DEF=-DLLVM_$(shell llvm-config --version | sed s/\\./_/)

BUILD_DATE=$(shell date +%Y%m%d)
BUILD_VERSION=$(shell git log | head -1)
BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)

CXX=g++
CPP=cpp

@@ -18,10 +23,14 @@ CXXFLAGS=-g3 $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
    -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""

LDFLAGS=
ifeq ($(ARCH),Linux)
ifeq ($(ARCH_OS),Linux)
  # try to link everything statically under Linux (including libstdc++) so
  # that the binaries we generate will be portable across distributions...
  LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
  ifeq ($(ARCH_TYPE),x86_64)
    LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
  else
    LDFLAGS=-L/usr/lib/gcc/i686-redhat-linux/4.6.0
  endif
endif

LEX=flex

@@ -34,17 +43,17 @@ CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
    util.cpp
HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
    opt.h stmt.h sym.h type.h util.h
STDLIB_SRC=stdlib-avx.ll stdlib-sse2.ll stdlib-sse4.ll stdlib-sse4x2.ll
BUILTINS_SRC=builtins-avx.ll builtins-sse2.ll builtins-sse4.ll builtins-sse4x2.ll
BISON_SRC=parse.yy
FLEX_SRC=lex.ll

OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(STDLIB_SRC:.ll=.o) stdlib-c.o stdlib_ispc.o \
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) builtins-c.o stdlib_ispc.o \
    $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))

default: ispc ispc_test

.PHONY: dirs clean depend doxygen print_llvm_src
.PRECIOUS: objs/stdlib-%.cpp
.PRECIOUS: objs/builtins-%.cpp

depend: $(CXX_SRC) $(HEADERS)
    @echo Updating dependencies

@@ -68,7 +77,7 @@ doxygen:

ispc: print_llvm_src dirs $(OBJS)
    @echo Creating ispc executable
    @$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(LLVM_LIBS)
    @$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(CLANG_LIBS) $(LLVM_LIBS)

ispc_test: dirs ispc_test.cpp
    @echo Creating ispc_test executable

@@ -94,27 +103,25 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
    @echo Compiling $<
    @$(CXX) $(CXXFLAGS) -o $@ -c $<

$(STDLIB_SRC): stdlib.m4
objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll
    @echo Creating C++ source from builtin definitions file $<
    @m4 builtins.m4 $< | ./bitcode2cpp.py $< > $@

objs/stdlib-%.cpp: stdlib-%.ll
    @echo Creating C++ source from stdlib file $<
    @m4 stdlib.m4 $< | ./bitcode2cpp.py $< > $@

objs/stdlib-%.o: objs/stdlib-%.cpp
objs/builtins-%.o: objs/builtins-%.cpp
    @echo Compiling $<
    @$(CXX) $(CXXFLAGS) -o $@ -c $<

objs/stdlib-c.cpp: stdlib-c.c
    @echo Creating C++ source from stdlib file $<
objs/builtins-c.cpp: builtins-c.c
    @echo Creating C++ source from builtins definition file $<
    @$(CLANG) -I /opt/l1om/usr/include/ -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py $< > $@

objs/stdlib-c.o: objs/stdlib-c.cpp
objs/builtins-c.o: objs/builtins-c.cpp
    @echo Compiling $<
    @$(CXX) $(CXXFLAGS) -o $@ -c $<

objs/stdlib_ispc.cpp: stdlib.ispc
    @echo Creating C++ source from $<
    @$(CPP) -DISPC=1 -DPI=3.1415926536 $< | ./stdlib2cpp.py > $@
    @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@

objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
    @echo Compiling $<
bitcode2cpp.py

@@ -9,7 +9,7 @@ length=0

src=str(sys.argv[1])

target = re.sub(".*stdlib-", "", src)
target = re.sub(".*builtins-", "", src)
target = re.sub("\.ll$", "", target)
target = re.sub("\.c$", "", target)
target = re.sub("-", "_", target)

@@ -20,14 +20,14 @@ except IOError:
    print >> sys.stderr, "Couldn't open " + src
    sys.exit(1)

print "unsigned char stdlib_bitcode_" + target + "[] = {"
print "unsigned char builtins_bitcode_" + target + "[] = {"
for line in as_out.stdout.readlines():
    length = length + len(line)
    for c in line:
        print ord(c)
        print ", "
print "  0 };\n\n"
print "int stdlib_bitcode_" + target + "_length = " + str(length) + ";\n"
print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n"

as_out.wait()
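The script above writes the assembled bitcode out as a C array named after the target, plus a matching length variable, so the builtins can be compiled straight into the ispc binary. A minimal sketch of the shape of the generated output and of one way such an array might be consumed; the byte values and the AddBitcodeToModule() consumer are illustrative assumptions, not part of this change:

```c
/* Shape of a generated file such as objs/builtins-sse4.cpp (byte values are placeholders). */
unsigned char builtins_bitcode_sse4[] = {
    66, 67, 192, 222,   /* ...one entry per byte of raw LLVM bitcode... */
    0 };
int builtins_bitcode_sse4_length = 4;

/* Hypothetical consumer: hand the embedded buffer to whatever loads it into a module. */
extern void AddBitcodeToModule(const unsigned char *bitcode, int length);

void LoadSSE4Builtins(void) {
    AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length);
}
```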
builtins-avx.ll (new file, 754 lines)

@@ -0,0 +1,754 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; *** Untested *** AVX target implementation.
;;
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
;; hasn't yet been tested.  There is therefore a higher-than-normal
;; chance that there are bugs in the code in this file.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Basic 8-wide definitions

stdlib_core(8)
packed_load_and_store(8)
int64minmax(8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; float iv = __rcp_v(v);
  ; return iv * (2. - v * iv);

  %call = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %0)
  ; do one N-R iteration
  %v_iv = fmul <8 x float> %0, %call
  %two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
                                 float 2., float 2., float 2., float 2.>, %v_iv
  %iv_mul = fmul <8 x float> %call, %two_minus
  ret <8 x float> %iv_mul
}

define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
  ; uniform float iv = extract(__rcp_u(v), 0);
  ; return iv * (2. - v * iv);
  %vecval = insertelement <4 x float> undef, float %0, i32 0
  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
  %scall = extractelement <4 x float> %call, i32 0

  ; do one N-R iteration
  %v_iv = fmul float %0, %scall
  %two_minus = fsub float 2., %v_iv
  %iv_mul = fmul float %scall, %two_minus
  ret float %iv_mul
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone

define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  %call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 8)
  ret <8 x float> %call
}

define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  ; the roundss intrinsic is a total mess--docs say:
  ;
  ;  __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
  ;
  ;  b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
  ;  on b0. The higher order 96 bits are copied directly from input parameter a. The
  ;  return value is described by the following equations:
  ;
  ;  r0 = RND(b0)
  ;  r1 = a1
  ;  r2 = a2
  ;  r3 = a3
  ;
  ; It doesn't matter what we pass as a, since we only need the r0 value
  ; here.  So we pass the same register for both.
  %xi = insertelement <4 x float> undef, float %0, i32 0
  %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
}

define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
  %call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
  ret <8 x float> %call
}

define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
  %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
}

define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
  %call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
  ret <8 x float> %call
}

define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
  %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
  round4to8double(%0, 8)
}

define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
  %xi = insertelement <2 x double> undef, double %0, i32 0
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
}

define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
  round4to8double(%0, 9)
}

define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
}

define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
  round4to8double(%0, 10)
}

define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt

declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
  ; float is = __rsqrt_v(v);
  %is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
  ; return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul <8 x float> %v, %is
  %v_is_is = fmul <8 x float> %v_is, %is
  %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3., float 3., float 3., float 3., float 3.>, %v_is_is
  %is_mul = fmul <8 x float> %is, %three_sub
  %half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
  ret <8 x float> %half_scale
}

define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  ; uniform float is = extract(__rsqrt_u(v), 0);
  %v = insertelement <4 x float> undef, float %0, i32 0
  %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
  %is = extractelement <4 x float> %vis, i32 0

  ; return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul float %0, %is
  %v_is_is = fmul float %v_is, %is
  %three_sub = fsub float 3., %v_is_is
  %is_mul = fmul float %is, %three_sub
  %half_scale = fmul float 0.5, %is_mul
  ret float %half_scale
}
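The __rcp_* and __rsqrt_* functions above follow the recipe in their comments: take the hardware's low-precision estimate (rcpps/rsqrtps) and apply one Newton-Raphson refinement step. A scalar C sketch of just the refinement math, with the initial estimate passed in as a parameter (an illustration, not ispc code):

```c
/* One Newton-Raphson step for 1/v, given an initial estimate iv:
   matches "return iv * (2. - v * iv);" in the comments above. */
static float refine_rcp(float v, float iv) {
    return iv * (2.0f - v * iv);
}

/* One Newton-Raphson step for 1/sqrt(v), given an initial estimate is:
   matches "return 0.5 * is * (3. - (v * is) * is);" above. */
static float refine_rsqrt(float v, float is) {
    return 0.5f * is * (3.0f - (v * is) * is);
}
```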
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt

declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
  %call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
  ret <8 x float> %call
}

define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
  ret float %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fastmath

declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind

define internal void @__fastmath() nounwind alwaysinline {
  %ptr = alloca i32
  %ptr8 = bitcast i32 * %ptr to i8 *
  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
  %oldval = load i32 *%ptr

  ; turn on DAZ (64)/FTZ (32768) -> 32832
  %update = or i32 %oldval, 32832
  store i32 %update, i32 *%ptr
  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
  ret void
}
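__fastmath spills the MXCSR control register to memory, ORs in the DAZ (64) and FTZ (32768) bits, and reloads it, so denormal inputs and results are flushed to zero. An equivalent C sketch using the SSE control-register intrinsics (an illustration of the same bit manipulation, not the path ispc emits):

```c
#include <xmmintrin.h>

/* Set denormals-are-zero (64) and flush-to-zero (32768); 64 | 32768 = 32832 = 0x8040. */
static void fastmath_flush_denormals(void) {
    unsigned int csr = _mm_getcsr();   /* same role as stmxcsr above */
    csr |= 0x8040;
    _mm_setcsr(csr);                   /* same role as ldmxcsr above */
}
```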
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml

; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
; or, use the macro to call the 4-wide ones twice with our 8-wide
; vectors...

declare <8 x float> @__svml_sin(<8 x float>)
declare <8 x float> @__svml_cos(<8 x float>)
declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
declare <8 x float> @__svml_tan(<8 x float>)
declare <8 x float> @__svml_atan(<8 x float>)
declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
declare <8 x float> @__svml_exp(<8 x float>)
declare <8 x float> @__svml_log(<8 x float>)
declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

define internal <8 x float> @__max_varying_float(<8 x float>,
                                                 <8 x float>) nounwind readonly alwaysinline {
  %call = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
  ret <8 x float> %call
}

define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
  ret float %ret
}

define internal <8 x float> @__min_varying_float(<8 x float>,
                                                 <8 x float>) nounwind readonly alwaysinline {
  %call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
  ret <8 x float> %call
}

define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
  ret float %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

declare <8 x i32> @llvm.x86.avx.min.sd.256(<8 x i32>, <8 x i32>) nounwind readnone
declare <8 x i32> @llvm.x86.avx.max.sd.256(<8 x i32>, <8 x i32>) nounwind readnone

define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %call = call <8 x i32> @llvm.x86.avx.min.sd.256(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %call
}

define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.min.sd.256, %0, %1)
  ret i32 %ret
}

define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %call = call <8 x i32> @llvm.x86.avx.max.sd.256(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %call
}

define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.max.sd.256, %0, %1)
  ret i32 %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

; FIXME: looks like these aren't available in LLVM?
declare <8 x i32> @llvm.x86.avx.min.ud.256(<8 x i32>, <8 x i32>) nounwind readnone
declare <8 x i32> @llvm.x86.avx.max.ud.256(<8 x i32>, <8 x i32>) nounwind readnone

define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
                                                <8 x i32>) nounwind readonly alwaysinline {
  %call = call <8 x i32> @llvm.x86.avx.min.ud.256(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %call
}

define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.min.ud.256, %0, %1)
  ret i32 %ret
}

define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
                                                <8 x i32>) nounwind readonly alwaysinline {
  %call = call <8 x i32> @llvm.x86.avx.max.ud.256(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %call
}

define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.max.ud.256, %0, %1)
  ret i32 %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops

declare i32 @llvm.ctpop.i32(i32) nounwind readnone

define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %call = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %call
}

declare i64 @llvm.ctpop.i64(i64) nounwind readnone

define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
  %call = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %call
}

declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
  %floatmask = bitcast <8 x i32> %0 to <8 x float>
  %v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
  ret i32 %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops

declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
  %v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
  %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
  %scalar1 = extractelement <8 x float> %v2, i32 0
  %scalar2 = extractelement <8 x float> %v2, i32 4
  %sum = fadd float %scalar1, %scalar2
  ret float %sum
}
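__reduce_add_float relies on the fact that 256-bit haddps adds pairs within each 128-bit half: after two passes, element 0 holds the sum of the low four lanes and element 4 the sum of the high four, so one scalar add finishes the reduction. A plain C sketch of that ordering (illustrative only):

```c
/* Horizontal add of 8 floats in the same order as the two haddps calls above. */
static float reduce_add_float8(const float v[8]) {
    /* first pass: adjacent pairs, separately in the low and high 128-bit halves */
    float low_pairs[2]  = { v[0] + v[1], v[2] + v[3] };
    float high_pairs[2] = { v[4] + v[5], v[6] + v[7] };
    /* second pass leaves the half-sums in elements 0 and 4 */
    float low  = low_pairs[0] + low_pairs[1];
    float high = high_pairs[0] + high_pairs[1];
    return low + high;   /* final scalar fadd */
}
```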
define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
  reduce8(float, @__min_varying_float, @__min_uniform_float)
}

define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
  reduce8(float, @__max_varying_float, @__max_uniform_float)
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops

define internal <8 x i32> @__add_varying_int32(<8 x i32>,
                                               <8 x i32>) nounwind readnone alwaysinline {
  %s = add <8 x i32> %0, %1
  ret <8 x i32> %s
}

define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
  %s = add i32 %0, %1
  ret i32 %s
}

define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
}

define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
}

define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops

define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
  %r = call i32 @__reduce_add_int32(<8 x i32> %v)
  ret i32 %r
}

define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}

define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops

declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
  %v0 = shufflevector <8 x double> %0, <8 x double> undef,
                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v1 = shufflevector <8 x double> %0, <8 x double> undef,
                      <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
  %scalar1 = extractelement <4 x double> %sum0, i32 0
  %scalar2 = extractelement <4 x double> %sum1, i32 1
  %sum = fadd double %scalar1, %scalar2
  ret double %sum
}

define internal double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
  reduce8(double, @__min_varying_double, @__min_uniform_double)
}

define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
  reduce8(double, @__max_varying_double, @__max_uniform_double)
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int64 ops

define internal <8 x i64> @__add_varying_int64(<8 x i64>,
                                               <8 x i64>) nounwind readnone alwaysinline {
  %s = add <8 x i64> %0, %1
  ret <8 x i64> %s
}

define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %s = add i64 %0, %1
  ret i64 %s
}

define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
}

define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}

define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops

define internal i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
  %r = call i64 @__reduce_add_int64(<8 x i64> %v)
  ret i64 %r
}

define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)

; no masked load instruction for i8 and i16 types??
load_masked(8, i8, 8, 1)
load_masked(8, i16, 16, 2)

declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)

define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
  %floatmask = bitcast <8 x i32> %mask to <8 x float>
  %floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
  %retval = bitcast <8 x float> %floatval to <8 x i32>
  ret <8 x i32> %retval
}


define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
  ; double up masks, bitcast to doubles
  %mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
                         <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %mask1 = shufflevector <8 x i32> %mask, <8 x i32> undef,
                         <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>

  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
  %ptr1 = getelementptr i8 * %0, i32 32
  %val1d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr1, <4 x double> %mask1d)

  %vald = shufflevector <4 x double> %val0d, <4 x double> %val1d,
                        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %val = bitcast <8 x double> %vald to <8 x i64>
  ret <8 x i64> %val
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

; FIXME: there is no AVX instruction for these, but we could be clever
; by packing the bits down and setting the last 3/4 or half, respectively,
; of the mask to zero...  Not sure if this would be a win in the end
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)

; note that mask is the 2nd parameter, not the 3rd one!!
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)

define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
                               <8 x i32>) nounwind alwaysinline {
  %ptr = bitcast <8 x i32> * %0 to i8 *
  %val = bitcast <8 x i32> %1 to <8 x float>
  %mask = bitcast <8 x i32> %2 to <8 x float>
  call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x float> %mask, <8 x float> %val)
  ret void
}

define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
                               <8 x i32> %mask) nounwind alwaysinline {
  %ptr = bitcast <8 x i64> * %0 to i8 *
  %val = bitcast <8 x i64> %1 to <8 x double>

  %mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
                         <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %mask1 = shufflevector <8 x i32> %mask, <8 x i32> undef,
                         <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>

  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>

  %val0 = shufflevector <8 x double> %val, <8 x double> undef,
                        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %val1 = shufflevector <8 x double> %val, <8 x double> undef,
                        <4 x i32> <i32 4, i32 5, i32 6, i32 7>

  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
  %ptr1 = getelementptr i8 * %ptr, i32 32
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x double> %mask1d, <4 x double> %val1)
  ret void
}

masked_store_blend_8_16_by_8()

declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
                                                <8 x float>) nounwind readnone


define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                                     <8 x i32>) nounwind alwaysinline {
  %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
  %oldValue = load <8 x i32>* %0, align 4
  %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
  %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
  %blend = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat,
                                                        <8 x float> %newAsFloat,
                                                        <8 x float> %mask_as_float)
  %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
  ret void
}


define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
                                     <8 x i32> %i32mask) nounwind alwaysinline {
  %oldValue = load <8 x i64>* %ptr, align 8
  %mask = bitcast <8 x i32> %i32mask to <8 x float>

  ; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
  ; are actually bitcast <4 x i64> values
  ;
  ; set up the first four 64-bit values
  %old01 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %old01f = bitcast <4 x i64> %old01 to <8 x float>
  %new01 = shufflevector <8 x i64> %new, <8 x i64> undef,
                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %new01f = bitcast <4 x i64> %new01 to <8 x float>
  ; compute mask--note that the indices are all doubled-up
  %mask01 = shufflevector <8 x float> %mask, <8 x float> undef,
                          <8 x i32> <i32 0, i32 0, i32 1, i32 1,
                                     i32 2, i32 2, i32 3, i32 3>
  ; and blend them
  %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
                                                            <8 x float> %new01f,
                                                            <8 x float> %mask01)
  %result01 = bitcast <8 x float> %result01f to <4 x i64>

  ; and again
  %old23 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
                         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %old23f = bitcast <4 x i64> %old23 to <8 x float>
  %new23 = shufflevector <8 x i64> %new, <8 x i64> undef,
                         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %new23f = bitcast <4 x i64> %new23 to <8 x float>
  ; compute mask--note that the values are doubled-up...
  %mask23 = shufflevector <8 x float> %mask, <8 x float> undef,
                          <8 x i32> <i32 4, i32 4, i32 5, i32 5,
                                     i32 6, i32 6, i32 7, i32 7>
  ; and blend them
  %result23f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f,
                                                            <8 x float> %new23f,
                                                            <8 x float> %mask23)
  %result23 = bitcast <8 x float> %result23f to <4 x i64>

  ; reconstruct the final <8 x i64> vector
  %final = shufflevector <4 x i64> %result01, <4 x i64> %result23,
                         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                                    i32 4, i32 5, i32 6, i32 7>
  store <8 x i64> %final, <8 x i64> * %ptr, align 8
  ret void
}
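Because blendvps operates on 32-bit lanes, the 64-bit blend above reuses the 8 x i32 execution mask by duplicating each mask element into two adjacent 32-bit lanes before bitcasting. A short C sketch of that mask doubling (a standalone illustration, not ispc code):

```c
#include <stdint.h>

/* Expand an 8-wide 32-bit mask into two 8-wide masks that cover 4 x 64-bit lanes each. */
static void double_up_mask(const int32_t mask[8], int32_t mask01[8], int32_t mask23[8]) {
    for (int i = 0; i < 4; ++i) {
        mask01[2 * i]     = mask[i];       /* indices 0,0,1,1,2,2,3,3 */
        mask01[2 * i + 1] = mask[i];
        mask23[2 * i]     = mask[i + 4];   /* indices 4,4,5,5,6,6,7,7 */
        mask23[2 * i + 1] = mask[i + 4];
    }
}
```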
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)

gen_scatter(8, i8)
gen_scatter(8, i16)
gen_scatter(8, i32)
gen_scatter(8, i64)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone

define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
  unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
  ret <8 x double> %ret
}

define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
  sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
  ret double %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse.min.sd(<2 x double>, <2 x double>) nounwind readnone

define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
  ret <8 x double> %ret
}

define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse.min.sd, %0, %1)
  ret double %ret
}

define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
  ret <8 x double> %ret
}

define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse.max.sd, %0, %1)
  ret double %ret
}
builtins-c.c (renamed from stdlib-c.c)

@@ -31,7 +31,7 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/** @file stdlib-c.c
/** @file builtins-c.c
    @brief Standard library function implementations written in C.

    This file provides C implementations of various functions that can be
builtins-sse.ll (renamed from stdlib-sse.ll)

@@ -31,12 +31,12 @@

;; This file declares implementations of various stdlib builtins that
;; only require SSE version 1 and 2 functionality; this file, in turn
;; is then included by stdlib-sse2.ll and stdlib-sse4.ll to provide
;; is then included by builtins-sse2.ll and builtins-sse4.ll to provide
;; those definitions for them.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

int8_16(4)
int64minmax(4)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

@@ -124,18 +124,19 @@ define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinlin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math mode

declare void @llvm.x86.sse.stmxcsr(i32 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i32 *) nounwind
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind

define internal void @__fastmath() nounwind alwaysinline {
  %ptr = alloca i32
  call void @llvm.x86.sse.stmxcsr(i32 * %ptr)
  %ptr8 = bitcast i32 * %ptr to i8 *
  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
  %oldval = load i32 *%ptr

  ; turn on DAZ (64)/FTZ (32768) -> 32832
  %update = or i32 %oldval, 32832
  store i32 %update, i32 *%ptr
  call void @llvm.x86.sse.ldmxcsr(i32 * %ptr)
  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
  ret void
}

@@ -227,6 +228,54 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
  ret float %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone

define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
  ret <4 x double> %ret
}


define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
  ret double %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
  ret <4 x double> %ret
}


define internal double @__min_uniform_double(double, double) nounwind readnone {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
  ret double %ret
}


define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
  ret <4 x double> %ret
}


define internal double @__max_uniform_double(double, double) nounwind readnone {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
  ret double %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

@@ -279,163 +328,89 @@ define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
}


define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
                      <2 x i32> <i32 0, i32 1>
  %v1 = shufflevector <4 x double> %0, <4 x double> undef,
                      <2 x i32> <i32 2, i32 3>
  %sum = fadd <2 x double> %v0, %v1
  %e0 = extractelement <2 x double> %sum, i32 0
  %e1 = extractelement <2 x double> %sum, i32 1
  %m = fadd double %e0, %e1
  ret double %m
}

define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
  reduce4(double, @__min_varying_double, @__min_uniform_double)
}

define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
  reduce4(double, @__max_varying_double, @__max_uniform_double)
}

define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
  %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
                      <2 x i32> <i32 0, i32 1>
  %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
                      <2 x i32> <i32 2, i32 3>
  %sum = add <2 x i64> %v0, %v1
  %e0 = extractelement <2 x i64> %sum, i32 0
  %e1 = extractelement <2 x i64> %sum, i32 1
  %m = add i64 %e0, %e1
  ret i64 %m
}

define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}

define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}

define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

define void @__masked_store_32(<4 x i32>* nocapture, <4 x i32>, <4 x i32>) nounwind alwaysinline {
  per_lane(4, <4 x i32> %2, `
    ; compute address for this one
    %ptr_ID = getelementptr <4 x i32> * %0, i32 0, i32 LANE
    %storeval_ID = extractelement <4 x i32> %1, i32 LANE
    store i32 %storeval_ID, i32 * %ptr_ID')
  ret void
}

define void @__masked_store_64(<4 x i64>* nocapture, <4 x i64>, <4 x i32>) nounwind alwaysinline {
  per_lane(4, <4 x i32> %2, `
    %ptr_ID = getelementptr <4 x i64> * %0, i32 0, i32 LANE
    %storeval_ID = extractelement <4 x i64> %1, i32 LANE
    store i64 %storeval_ID, i64 * %ptr_ID')
  ret void
}
masked_store_blend_8_16_by_4()

gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

define <4 x i32> @__load_and_broadcast_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
  ; must not load if the mask is all off; the address may be invalid
  %mm = call i32 @__movmsk(<4 x i32> %mask)
  %any_on = icmp ne i32 %mm, 0
  br i1 %any_on, label %load, label %skip

load:
  %ptr = bitcast i8 * %0 to i32 *
  %val = load i32 * %ptr

  %ret0 = insertelement <4 x i32> undef, i32 %val, i32 0
  %ret1 = insertelement <4 x i32> %ret0, i32 %val, i32 1
  %ret2 = insertelement <4 x i32> %ret1, i32 %val, i32 2
  %ret3 = insertelement <4 x i32> %ret2, i32 %val, i32 3
  ret <4 x i32> %ret3

skip:
  ret <4 x i32> undef
}

define <4 x i64> @__load_and_broadcast_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
  ; must not load if the mask is all off; the address may be invalid
  %mm = call i32 @__movmsk(<4 x i32> %mask)
  %any_on = icmp ne i32 %mm, 0
  br i1 %any_on, label %load, label %skip

load:
  %ptr = bitcast i8 * %0 to i64 *
  %val = load i64 * %ptr

  %ret0 = insertelement <4 x i64> undef, i64 %val, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %val, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %val, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %val, i32 3
  ret <4 x i64> %ret3

skip:
  ret <4 x i64> undef
}

define <4 x i32> @__load_masked_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
  %mm = call i32 @__movmsk(<4 x i32> %mask)
  %any_on = icmp ne i32 %mm, 0
  br i1 %any_on, label %load, label %skip

load:
  ; if any mask lane is on, just load all of the values
  ; FIXME: there is a lurking bug here if we straddle a page boundary, the
  ; next page is invalid to read, but the mask bits are set so that we
  ; aren't supposed to be reading those elements...
  %ptr = bitcast i8 * %0 to <4 x i32> *
  %val = load <4 x i32> * %ptr, align 4
  ret <4 x i32> %val

skip:
  ret <4 x i32> undef
}

define <4 x i64> @__load_masked_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
  %mm = call i32 @__movmsk(<4 x i32> %mask)
  %any_on = icmp ne i32 %mm, 0
  br i1 %any_on, label %load, label %skip

load:
  ; if any mask lane is on, just load all of the values
  ; FIXME: there is a lurking bug here if we straddle a page boundary, the
  ; next page is invalid to read, but the mask bits are set so that we
  ; aren't supposed to be reading those elements...
  %ptr = bitcast i8 * %0 to <4 x i64> *
  %val = load <4 x i64> * %ptr, align 8
  ret <4 x i64> %val

skip:
  ret <4 x i64> undef
}
load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)

load_masked(4, i8, 8, 1)
load_masked(4, i16, 16, 2)
load_masked(4, i32, 32, 4)
load_masked(4, i64, 64, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

; define these with the macros from stdlib.m4

gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)

gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone

define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
  ret <4 x double> %ret
}


define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.pd, %0)
  ret double %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
  ret <4 x double> %ret
}


define internal double @__min_uniform_double(double, double) nounwind readnone {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.pd, %0, %1)
  ret double %ret
}


define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
  ret <4 x double> %ret
}


define internal double @__max_uniform_double(double, double) nounwind readnone {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.pd, %0, %1)
  ret double %ret
}
@@ -37,7 +37,7 @@ stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
|
||||
; Include the various definitions of things that only require SSE1 and SSE2
|
||||
include(`stdlib-sse.ll')
|
||||
include(`builtins-sse.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
@@ -152,6 +152,40 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
|
||||
ret float %binop.i
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare double @round(double)
|
||||
declare double @floor(double)
|
||||
declare double @ceil(double)
|
||||
|
||||
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @round)
|
||||
}
|
||||
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%r = call double @round(double %0)
|
||||
ret double %r
|
||||
}
|
||||
|
||||
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @floor)
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%r = call double @floor(double %0)
|
||||
ret double %r
|
||||
}
|
||||
|
||||
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @ceil)
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%r = call double @ceil(double %0)
|
||||
ret double %r
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; min/max
|
||||
|
||||
@@ -244,7 +278,15 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
|
||||
|
||||
; FIXME: this is very inefficient, loops over all 32 bits...

define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
; we could use the LLVM intrinsic declare i32 @llvm.ctpop.i32(i32),
; although that currently ends up generating a POPCNT instruction even
; if we give --target=sse2 on the command line. We probably need to
; pipe through the 'sse2' request to LLVM via the 'features' string
; at codegen time... (If e.g. --cpu=penryn is also passed along, then
; it does generate non-POPCNT code and in particular better code than
; the below does.)

define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
entry:
|
||||
br label %loop
|
||||
|
||||
@@ -261,6 +303,16 @@ exit:
|
||||
ret i32 %newcount
|
||||
}
|
||||
|
||||
define internal i32 @__popcnt_int64(i64) nounwind readnone alwaysinline {
|
||||
%vec = bitcast i64 %0 to <2 x i32>
|
||||
%v0 = extractelement <2 x i32> %vec, i32 0
|
||||
%v1 = extractelement <2 x i32> %vec, i32 1
|
||||
%c0 = call i32 @__popcnt_int32(i32 %v0)
|
||||
%c1 = call i32 @__popcnt_int32(i32 %v1)
|
||||
%sum = add i32 %c0, %c1
|
||||
ret i32 %sum
|
||||
}
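(Editorial aside, not part of the diff: the FIXME above notes that the bit-by-bit loop is very inefficient. If a portable replacement were ever wanted — one that needs neither the POPCNT instruction nor llvm.ctpop — a SWAR-style count is a common option. The C++ below is only an illustrative sketch using the standard bit-counting masks.)

```cpp
#include <cstdint>

// Branch-free popcount that avoids a 32-iteration loop; the constants are
// the usual SWAR masks (pairs, nibbles, bytes, then a byte-sum multiply).
static inline uint32_t popcount32(uint32_t v) {
    v = v - ((v >> 1) & 0x55555555u);                  // count bits in pairs
    v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);  // sums of 4-bit groups
    v = (v + (v >> 4)) & 0x0F0F0F0Fu;                  // sums of 8-bit groups
    return (v * 0x01010101u) >> 24;                    // add the four byte sums
}

// Mirrors __popcnt_int64 above: split into two 32-bit halves and add.
static inline uint32_t popcount64(uint64_t v) {
    return popcount32((uint32_t)v) + popcount32((uint32_t)(v >> 32));
}
```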
|
||||
|
||||
|
||||
define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
|
||||
@@ -37,10 +37,10 @@ stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
|
||||
; Define the stuff that can be done with base SSE1/SSE2 instructions
|
||||
include(`stdlib-sse.ll')
|
||||
include(`builtins-sse.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; math
|
||||
;; rounding floats
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
@@ -106,7 +106,52 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; integer min/max
|
||||
;; rounding doubles
|
||||
|
||||
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||
|
||||
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
round2to4double(%0, 8)
|
||||
}
|
||||
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
round2to4double(%0, 9)
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
round2to4double(%0, 10)
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
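(For reference only — an illustrative sketch, not part of this change: the immediate passed to the llvm.x86.sse41.round.* intrinsics packs the rounding mode in the low two bits and the "suppress precision exception" flag in bit 3, which is where the 8, 9, and 10 used above come from.)

```cpp
// How the round immediates used above are formed; the comment names refer
// to the _MM_FROUND_* constants from <smmintrin.h>.
enum RoundImm {
    kNoExc        = 0x8,            // don't signal precision (inexact) exceptions
    kRoundNearest = 0x0 | kNoExc,   // 8:  round to nearest
    kRoundDown    = 0x1 | kNoExc,   // 9:  floor
    kRoundUp      = 0x2 | kNoExc    // 10: ceil
};
```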
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int32 min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
@@ -163,11 +208,18 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
int8_16(8)
|
||||
int64minmax(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
@@ -127,22 +127,22 @@ define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinlin
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fast math
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i32 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i32 *) nounwind
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define internal void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
call void @llvm.x86.sse.stmxcsr(i32 * %ptr)
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.sse.ldmxcsr(i32 * %ptr)
|
||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||
ret void
|
||||
}
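(Side note, not part of the diff: a rough C equivalent of __fastmath() above, assuming the SSE control-register intrinsics are available. 32832 = 0x8040 sets both the FTZ (bit 15) and DAZ (bit 6) bits in MXCSR.)

```cpp
#include <xmmintrin.h>

// Equivalent of the stmxcsr / or / ldmxcsr sequence in __fastmath().
static inline void fastmath_equivalent() {
    unsigned int csr = _mm_getcsr();   // ~ llvm.x86.sse.stmxcsr
    csr |= 0x8040;                     // FTZ (32768) | DAZ (64) = 32832
    _mm_setcsr(csr);                   // ~ llvm.x86.sse.ldmxcsr
}
```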
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
@@ -258,7 +258,7 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
;; int32 min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
@@ -380,92 +380,88 @@ define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinli
|
||||
reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
per_lane(8, <8 x i32> %2, `
|
||||
; compute address for this one
|
||||
%ptr_ID = getelementptr <8 x i32> * %0, i32 0, i32 LANE
|
||||
%storeval_ID = extractelement <8 x i32> %1, i32 LANE
|
||||
store i32 %storeval_ID, i32 * %ptr_ID')
|
||||
ret void
|
||||
define internal <4 x double> @__add_varying_double(<4 x double>,
|
||||
<4 x double>) nounwind readnone alwaysinline {
|
||||
%r = fadd <4 x double> %0, %1
|
||||
ret <4 x double> %r
|
||||
}
|
||||
|
||||
|
||||
define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
per_lane(8, <8 x i32> %2, `
|
||||
; compute address for this one
|
||||
%ptr_ID = getelementptr <8 x i64> * %0, i32 0, i32 LANE
|
||||
%storeval_ID = extractelement <8 x i64> %1, i32 LANE
|
||||
store i64 %storeval_ID, i64 * %ptr_ID')
|
||||
ret void
|
||||
define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
%r = fadd double %0, %1
|
||||
ret double %r
|
||||
}
|
||||
|
||||
define internal double @__reduce_add_double(<8 x double>) nounwind readnone {
|
||||
reduce8by4(double, @__add_varying_double, @__add_uniform_double)
|
||||
}
|
||||
|
||||
define internal double @__reduce_min_double(<8 x double>) nounwind readnone {
|
||||
reduce8(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
define internal double @__reduce_max_double(<8 x double>) nounwind readnone {
|
||||
reduce8(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
define internal <4 x i64> @__add_varying_int64(<4 x i64>,
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
%r = add <4 x i64> %0, %1
|
||||
ret <4 x i64> %r
|
||||
}
|
||||
|
||||
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
%r = add i64 %0, %1
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
; FIXME: I think this and the next one need to verify that the mask isn't
|
||||
; all off before doing the load!!! (See e.g. stdlib-sse.ll)
|
||||
load_and_broadcast(8, i8, 8)
|
||||
load_and_broadcast(8, i16, 16)
|
||||
load_and_broadcast(8, i32, 32)
|
||||
load_and_broadcast(8, i64, 64)
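(Aside, not part of the change: the FIXME above asks these functions to check that the mask isn't entirely off before dereferencing the pointer. A hypothetical C++ sketch of that guard — names and layout are assumptions for illustration only — is shown below.)

```cpp
// Skip the scalar load entirely when no lanes are active, since the pointer
// may not be safe to read in that case; otherwise load once and broadcast.
template <typename T, int N>
void load_and_broadcast(const T *ptr, const int mask[N], T result[N]) {
    bool anyOn = false;
    for (int i = 0; i < N; ++i)
        anyOn |= (mask[i] != 0);
    if (!anyOn)
        return;                 // nothing is read, result left untouched
    T v = *ptr;                 // single scalar load
    for (int i = 0; i < N; ++i)
        result[i] = v;          // broadcast to all lanes
}
```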
|
||||
|
||||
define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
%ptr = bitcast i8 * %0 to i32 *
|
||||
%val = load i32 * %ptr
|
||||
|
||||
%ret0 = insertelement <8 x i32> undef, i32 %val, i32 0
|
||||
%ret1 = insertelement <8 x i32> %ret0, i32 %val, i32 1
|
||||
%ret2 = insertelement <8 x i32> %ret1, i32 %val, i32 2
|
||||
%ret3 = insertelement <8 x i32> %ret2, i32 %val, i32 3
|
||||
%ret4 = insertelement <8 x i32> %ret3, i32 %val, i32 4
|
||||
%ret5 = insertelement <8 x i32> %ret4, i32 %val, i32 5
|
||||
%ret6 = insertelement <8 x i32> %ret5, i32 %val, i32 6
|
||||
%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
|
||||
ret <8 x i32> %ret7
|
||||
}
|
||||
|
||||
|
||||
define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
%ptr = bitcast i8 * %0 to i64 *
|
||||
%val = load i64 * %ptr
|
||||
|
||||
%ret0 = insertelement <8 x i64> undef, i64 %val, i32 0
|
||||
%ret1 = insertelement <8 x i64> %ret0, i64 %val, i32 1
|
||||
%ret2 = insertelement <8 x i64> %ret1, i64 %val, i32 2
|
||||
%ret3 = insertelement <8 x i64> %ret2, i64 %val, i32 3
|
||||
%ret4 = insertelement <8 x i64> %ret3, i64 %val, i32 4
|
||||
%ret5 = insertelement <8 x i64> %ret4, i64 %val, i32 5
|
||||
%ret6 = insertelement <8 x i64> %ret5, i64 %val, i32 6
|
||||
%ret7 = insertelement <8 x i64> %ret6, i64 %val, i32 7
|
||||
ret <8 x i64> %ret7
|
||||
}
|
||||
|
||||
|
||||
define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
%ptr = bitcast i8 * %0 to <8 x i32> *
|
||||
%val = load <8 x i32> * %ptr, align 4
|
||||
ret <8 x i32> %val
|
||||
}
|
||||
|
||||
|
||||
define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
%ptr = bitcast i8 * %0 to <8 x i64> *
|
||||
%val = load <8 x i64> * %ptr, align 8
|
||||
ret <8 x i64> %val
|
||||
}
|
||||
load_masked(8, i8, 8, 1)
|
||||
load_masked(8, i16, 16, 2)
|
||||
load_masked(8, i32, 32, 4)
|
||||
load_masked(8, i64, 64, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
|
||||
gen_scatter(8, i8)
|
||||
gen_scatter(8, i16)
|
||||
gen_scatter(8, i32)
|
||||
gen_scatter(8, i64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; math
|
||||
;; float rounding
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
@@ -526,16 +522,68 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
round2to8double(%0, 8)
|
||||
}
|
||||
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
round2to8double(%0, 9)
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
round2to8double(%0, 10)
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
@@ -555,6 +603,13 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(8, i8, 8)
|
||||
gen_masked_store(8, i16, 16)
|
||||
gen_masked_store(8, i32, 32)
|
||||
gen_masked_store(8, i64, 64)
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
|
||||
<4 x float>) nounwind readnone
|
||||
|
||||
526 builtins.cpp
@@ -52,6 +52,7 @@
|
||||
#include <llvm/Type.h>
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Intrinsics.h>
|
||||
#include <llvm/Linker.h>
|
||||
#include <llvm/Support/MemoryBuffer.h>
|
||||
#include <llvm/Bitcode/ReaderWriter.h>
|
||||
@@ -64,43 +65,88 @@ extern yy_buffer_state *yy_scan_string(const char *);
|
||||
/** Given an LLVM type, try to find the equivalent ispc type. Note that
|
||||
this is an under-constrained problem due to LLVM's type representations
|
||||
carrying less information than ispc's. (For example, LLVM doesn't
|
||||
distinguish between signed and unsigned integers in its types.)
|
||||
distinguish between signed and unsigned integers in its types.)
|
||||
|
||||
Because this function is only used for generating ispc declarations of
|
||||
functions defined in LLVM bitcode in the builtins-*.ll files, in practice
|
||||
we can get enough of what we need for the relevant cases to make things
|
||||
work, partially with the help of the intAsUnsigned parameter, which
|
||||
indicates whether LLVM integer types should be treated as being signed
|
||||
or unsigned.
|
||||
|
||||
However, because this function is only used for generating ispc
|
||||
declarations of functions defined in LLVM bitcode in the stdlib-*.ll
|
||||
files, in practice we can get enough of what we need for the relevant
|
||||
cases to make things work.
|
||||
*/
|
||||
static const Type *
|
||||
lLLVMTypeToISPCType(const llvm::Type *t) {
|
||||
lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
if (t == LLVMTypes::VoidType)
|
||||
return AtomicType::Void;
|
||||
|
||||
// uniform
|
||||
else if (t == LLVMTypes::BoolType)
|
||||
return AtomicType::UniformBool;
|
||||
else if (t == LLVMTypes::Int8Type)
|
||||
return intAsUnsigned ? AtomicType::UniformUInt8 : AtomicType::UniformInt8;
|
||||
else if (t == LLVMTypes::Int16Type)
|
||||
return intAsUnsigned ? AtomicType::UniformUInt16 : AtomicType::UniformInt16;
|
||||
else if (t == LLVMTypes::Int32Type)
|
||||
return AtomicType::UniformInt32;
|
||||
return intAsUnsigned ? AtomicType::UniformUInt32 : AtomicType::UniformInt32;
|
||||
else if (t == LLVMTypes::FloatType)
|
||||
return AtomicType::UniformFloat;
|
||||
else if (t == LLVMTypes::DoubleType)
|
||||
return AtomicType::UniformDouble;
|
||||
else if (t == LLVMTypes::Int64Type)
|
||||
return AtomicType::UniformInt64;
|
||||
return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
|
||||
|
||||
// varying
|
||||
else if (t == LLVMTypes::Int8VectorType)
|
||||
return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
|
||||
else if (t == LLVMTypes::Int16VectorType)
|
||||
return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16;
|
||||
else if (t == LLVMTypes::Int32VectorType)
|
||||
return AtomicType::VaryingInt32;
|
||||
return intAsUnsigned ? AtomicType::VaryingUInt32 : AtomicType::VaryingInt32;
|
||||
else if (t == LLVMTypes::FloatVectorType)
|
||||
return AtomicType::VaryingFloat;
|
||||
else if (t == LLVMTypes::DoubleVectorType)
|
||||
return AtomicType::VaryingDouble;
|
||||
else if (t == LLVMTypes::Int64VectorType)
|
||||
return AtomicType::VaryingInt64;
|
||||
return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64;
|
||||
|
||||
// pointers to uniform
|
||||
else if (t == LLVMTypes::Int8PointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt8 :
|
||||
AtomicType::UniformInt8, false);
|
||||
else if (t == LLVMTypes::Int16PointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt16 :
|
||||
AtomicType::UniformInt16, false);
|
||||
else if (t == LLVMTypes::Int32PointerType)
|
||||
return new ReferenceType(AtomicType::UniformInt32, false);
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 :
|
||||
AtomicType::UniformInt32, false);
|
||||
else if (t == LLVMTypes::Int64PointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt64 :
|
||||
AtomicType::UniformInt64, false);
|
||||
else if (t == LLVMTypes::FloatPointerType)
|
||||
return new ReferenceType(AtomicType::UniformFloat, false);
|
||||
else if (t == LLVMTypes::DoublePointerType)
|
||||
return new ReferenceType(AtomicType::UniformDouble, false);
|
||||
|
||||
// pointers to varying
|
||||
else if (t == LLVMTypes::Int8VectorPointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt8 :
|
||||
AtomicType::VaryingInt8, false);
|
||||
else if (t == LLVMTypes::Int16VectorPointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt16 :
|
||||
AtomicType::VaryingInt16, false);
|
||||
else if (t == LLVMTypes::Int32VectorPointerType)
|
||||
return new ReferenceType(AtomicType::VaryingInt32, false);
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
|
||||
AtomicType::VaryingInt32, false);
|
||||
else if (t == LLVMTypes::Int64VectorPointerType)
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt64 :
|
||||
AtomicType::VaryingInt64, false);
|
||||
else if (t == LLVMTypes::FloatVectorPointerType)
|
||||
return new ReferenceType(AtomicType::VaryingFloat, false);
|
||||
else if (t == LLVMTypes::DoubleVectorPointerType)
|
||||
return new ReferenceType(AtomicType::VaryingDouble, false);
|
||||
|
||||
// arrays
|
||||
else if (llvm::isa<const llvm::PointerType>(t)) {
|
||||
const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);
|
||||
|
||||
@@ -108,15 +154,16 @@ lLLVMTypeToISPCType(const llvm::Type *t) {
|
||||
// create the equivalent ispc type. Note that it has to be a
|
||||
// reference to an array, since ispc passes arrays to functions by
|
||||
// reference.
|
||||
//
|
||||
// FIXME: generalize this to do more than uniform int32s (that's
|
||||
// all that's necessary for the stdlib currently.)
|
||||
const llvm::ArrayType *at =
|
||||
llvm::dyn_cast<const llvm::ArrayType>(pt->getElementType());
|
||||
if (at && at->getNumElements() == 0 &&
|
||||
at->getElementType() == LLVMTypes::Int32Type)
|
||||
return new ReferenceType(new ArrayType(AtomicType::UniformInt32, 0),
|
||||
if (at != NULL) {
|
||||
const Type *eltType = lLLVMTypeToISPCType(at->getElementType(),
|
||||
intAsUnsigned);
|
||||
if (eltType == NULL)
|
||||
return NULL;
|
||||
return new ReferenceType(new ArrayType(eltType, at->getNumElements()),
|
||||
false);
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
@@ -135,26 +182,73 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
const llvm::FunctionType *ftype = func->getFunctionType();
|
||||
std::string name = func->getName();
|
||||
|
||||
const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType());
|
||||
if (!returnType)
|
||||
// return type not representable in ispc -> not callable from ispc
|
||||
if (name.size() < 3 || name[0] != '_' || name[1] != '_')
|
||||
return false;
|
||||
|
||||
// Iterate over the arguments and try to find their equivalent ispc
|
||||
// types.
|
||||
std::vector<const Type *> argTypes;
|
||||
for (unsigned int i = 0; i < ftype->getNumParams(); ++i) {
|
||||
const llvm::Type *llvmArgType = ftype->getParamType(i);
|
||||
const Type *type = lLLVMTypeToISPCType(llvmArgType);
|
||||
if (type == NULL)
|
||||
return false;
|
||||
argTypes.push_back(type);
|
||||
// An unfortunate hack: we want this builtin function to have the
|
||||
// signature "int __sext_varying_bool(bool)", but the ispc function
|
||||
// symbol creation code below assumes that any LLVM vector of i32s is a
|
||||
// varying int32. Here, we need that to be interpreted as a varying
|
||||
// bool, so just have a one-off override for that one...
|
||||
if (name == "__sext_varying_bool") {
|
||||
const Type *returnType = AtomicType::VaryingInt32;
|
||||
std::vector<const Type *> argTypes;
|
||||
argTypes.push_back(AtomicType::VaryingBool);
|
||||
std::vector<ConstExpr *> defaults;
|
||||
defaults.push_back(NULL);
|
||||
|
||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||
funcType->SetArgumentDefaults(defaults);
|
||||
|
||||
Symbol *sym = new Symbol(name, noPos, funcType);
|
||||
sym->function = func;
|
||||
symbolTable->AddFunction(sym);
|
||||
return true;
|
||||
}
|
||||
|
||||
// If the function has any parameters with integer types, we'll make
|
||||
// two Symbols for two overloaded versions of the function, one with
|
||||
// all of the integer types treated as signed integers and one with all
|
||||
// of them treated as unsigned.
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
bool intAsUnsigned = (i == 1);
|
||||
|
||||
const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType(),
|
||||
intAsUnsigned);
|
||||
if (!returnType)
|
||||
// return type not representable in ispc -> not callable from ispc
|
||||
return false;
|
||||
|
||||
// Iterate over the arguments and try to find their equivalent ispc
|
||||
// types. Track if any of the arguments has an integer type.
|
||||
bool anyIntArgs = false;
|
||||
std::vector<const Type *> argTypes;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
|
||||
const llvm::Type *llvmArgType = ftype->getParamType(j);
|
||||
const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
|
||||
if (type == NULL)
|
||||
return false;
|
||||
anyIntArgs |=
|
||||
(Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
|
||||
argTypes.push_back(type);
|
||||
}
|
||||
|
||||
// Always create the symbol the first time through, in particular
|
||||
// so that we get symbols for things with no integer types!
|
||||
if (i == 0 || anyIntArgs == true) {
|
||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||
// set NULL default arguments
|
||||
std::vector<ConstExpr *> defaults;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j)
|
||||
defaults.push_back(NULL);
|
||||
funcType->SetArgumentDefaults(defaults);
|
||||
|
||||
Symbol *sym = new Symbol(name, noPos, funcType);
|
||||
sym->function = func;
|
||||
symbolTable->AddFunction(sym);
|
||||
}
|
||||
}
|
||||
|
||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||
Symbol *sym = new Symbol(name, noPos, funcType);
|
||||
sym->function = func;
|
||||
symbolTable->AddFunction(sym);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -176,227 +270,32 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
}
|
||||
}
|
||||
|
||||
/** Declare the function symbol 'bool __is_compile_time_constant_mask(mask type)'.
|
||||
This function will never be defined; it's just a placeholder
|
||||
that will be handled during the optimization process. See the
|
||||
discussion of the implementation of CompileTimeConstantResolvePass for
|
||||
more details.
|
||||
*/
|
||||
static void
|
||||
lDeclareCompileTimeConstant(llvm::Module *module) {
|
||||
SourcePos noPos;
|
||||
noPos.name = "__stdlib";
|
||||
|
||||
std::vector<const llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::BoolType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__is_compile_time_constant_mask", module);
|
||||
func->setOnlyReadsMemory(true);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
|
||||
/** Declare the 'pseudo-gather' functions. When the ispc front-end needs
to perform a gather, it generates a call to one of these functions,
which have signatures:

varying int32 __pseudo_gather(varying int32 *, mask)
varying int64 __pseudo_gather(varying int64 *, mask)

These functions are never actually implemented; the
GatherScatterFlattenOpt optimization pass finds them and then converts
them to make calls to the following functions, which represent gathers
from a common base pointer with offsets. This approach allows the
front-end to be relatively simple in how it emits address calculation
for gathers.

varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base,
int32 offsets, mask)
varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base,
int64 offsets, mask)

Then, the GSImprovementsPass optimization finds these and either
converts them to native gather functions or converts them to vector
loads, if equivalent.
*/
|
||||
static void
|
||||
lDeclarePseudoGathers(llvm::Module *module) {
|
||||
SourcePos noPos;
|
||||
noPos.name = "__stdlib";
|
||||
|
||||
{
|
||||
std::vector<const llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerVectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_gather_32", module);
|
||||
func->setOnlyReadsMemory(true);
|
||||
func->setDoesNotThrow(true);
|
||||
|
||||
fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false);
|
||||
func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_gather_64", module);
|
||||
func->setOnlyReadsMemory(true);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<const llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerType);
|
||||
argTypes.push_back(LLVMTypes::Int32VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_gather_base_offsets_32", module);
|
||||
func->setOnlyReadsMemory(true);
|
||||
func->setDoesNotThrow(true);
|
||||
|
||||
fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false);
|
||||
func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_gather_base_offsets_64", module);
|
||||
func->setOnlyReadsMemory(true);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Similarly to the 'pseudo-gathers' defined by lDeclarePseudoGathers(),
|
||||
we also declare (but never define) pseudo-scatter instructions with
|
||||
signatures:
|
||||
|
||||
void __pseudo_scatter_32(varying int32 *, varying int32 values, mask)
|
||||
void __pseudo_scatter_64(varying int64 *, varying int64 values, mask)
|
||||
|
||||
The GatherScatterFlattenOpt optimization pass also finds these and
|
||||
transforms them to scatters like:
|
||||
|
||||
void __pseudo_scatter_base_offsets_32(uniform int32 *base,
|
||||
varying int32 offsets, varying int32 values, mask)
|
||||
void __pseudo_scatter_base_offsets_64(uniform int64 *base,
|
||||
varying int64 offsets, varying int64 values, mask)
|
||||
|
||||
And the GSImprovementsPass in turn converts these to actual native
|
||||
scatters or masked stores.
|
||||
/** In many of the builtins-*.ll files, we have declarations of various LLVM
|
||||
intrinsics that are then used in the implementation of various target-
|
||||
specific functions. This function loops over all of the intrinsic
|
||||
declarations and makes sure that the signature we have in our .ll file
|
||||
matches the signature of the actual intrinsic.
|
||||
*/
|
||||
static void
|
||||
lDeclarePseudoScatters(llvm::Module *module) {
|
||||
SourcePos noPos;
|
||||
noPos.name = "__stdlib";
|
||||
lCheckModuleIntrinsics(llvm::Module *module) {
|
||||
llvm::Module::iterator iter;
|
||||
for (iter = module->begin(); iter != module->end(); ++iter) {
|
||||
llvm::Function *func = iter;
|
||||
if (!func->isIntrinsic())
|
||||
continue;
|
||||
|
||||
{
|
||||
std::vector<const llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerVectorType);
|
||||
argTypes.push_back(LLVMTypes::Int32VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_scatter_32", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
{
|
||||
std::vector<const llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerVectorType);
|
||||
argTypes.push_back(LLVMTypes::Int64VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_scatter_64", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<const llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerType);
|
||||
argTypes.push_back(LLVMTypes::Int32VectorType);
|
||||
argTypes.push_back(LLVMTypes::Int32VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_scatter_base_offsets_32", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
{
|
||||
std::vector<const llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerType);
|
||||
argTypes.push_back(LLVMTypes::Int32VectorType);
|
||||
argTypes.push_back(LLVMTypes::Int64VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_scatter_base_offsets_64", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** This function declares placeholder masked store functions for the
|
||||
front-end to use.
|
||||
|
||||
void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask)
|
||||
void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask)
|
||||
|
||||
These in turn are converted to native masked stores or to regular
|
||||
stores (if the mask is all on) by the MaskedStoreOptPass optimization
|
||||
pass.
|
||||
*/
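(Aside, not part of the diff: the reference semantics of the placeholder masked stores described above are simply "store each lane whose mask bit is on". A minimal C++ sketch of that behavior, for illustration only:)

```cpp
#include <cstdint>

// Per-lane masked store: write values[i] only where the mask lane is on.
static void masked_store_32(int32_t *ptr, const int32_t *values,
                            const int32_t *mask, int width) {
    for (int i = 0; i < width; ++i)
        if (mask[i] != 0)
            ptr[i] = values[i];
}
```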
|
||||
static void
|
||||
lDeclarePseudoMaskedStore(llvm::Module *module) {
|
||||
SourcePos noPos;
|
||||
noPos.name = "__stdlib";
|
||||
|
||||
{
|
||||
std::vector<const llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::Int32VectorPointerType);
|
||||
argTypes.push_back(LLVMTypes::Int32VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_masked_store_32", module);
|
||||
func->setDoesNotThrow(true);
|
||||
func->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
func->setDoesNotCapture(1, true);
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<const llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::Int64VectorPointerType);
|
||||
argTypes.push_back(LLVMTypes::Int64VectorType);
|
||||
argTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
llvm::FunctionType *fType =
|
||||
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
|
||||
"__pseudo_masked_store_64", module);
|
||||
func->setDoesNotThrow(true);
|
||||
func->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
func->setDoesNotCapture(1, true);
|
||||
const std::string funcName = func->getName().str();
|
||||
// Work around http://llvm.org/bugs/show_bug.cgi?id=10438; only
|
||||
// check the llvm.x86.* intrinsics for now...
|
||||
if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
|
||||
llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
|
||||
assert(id != 0);
|
||||
LLVM_TYPE_CONST llvm::Type *intrinsicType =
|
||||
llvm::Intrinsic::getType(*g->ctx, id);
|
||||
intrinsicType = llvm::PointerType::get(intrinsicType, 0);
|
||||
assert(func->getType() == intrinsicType);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -424,6 +323,7 @@ lAddBitcode(const unsigned char *bitcode, int length,
|
||||
if (llvm::Linker::LinkModules(module, bcModule, &linkError))
|
||||
Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
|
||||
lAddModuleSymbols(module, symbolTable);
|
||||
lCheckModuleIntrinsics(module);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -437,7 +337,7 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
||||
Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32);
|
||||
pw->isStatic = true;
|
||||
pw->constValue = new ConstExpr(pw->type, val, SourcePos());
|
||||
const llvm::Type *ltype = LLVMTypes::Int32Type;
|
||||
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
|
||||
llvm::Constant *linit = LLVMInt32(val);
|
||||
pw->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
|
||||
llvm::GlobalValue::InternalLinkage,
|
||||
@@ -457,7 +357,7 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
pi[i] = i;
|
||||
pidx->constValue = new ConstExpr(pidx->type, pi, SourcePos());
|
||||
|
||||
const llvm::Type *ltype = LLVMTypes::Int32VectorType;
|
||||
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32VectorType;
|
||||
llvm::Constant *linit = LLVMInt32Vector(pi);
|
||||
pidx->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
|
||||
llvm::GlobalValue::InternalLinkage, linit,
|
||||
@@ -469,32 +369,32 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
void
|
||||
DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
|
||||
bool includeStdlibISPC) {
|
||||
// Add the definitions from the compiled stdlib-c.c file
|
||||
extern unsigned char stdlib_bitcode_c[];
|
||||
extern int stdlib_bitcode_c_length;
|
||||
lAddBitcode(stdlib_bitcode_c, stdlib_bitcode_c_length, module, symbolTable);
|
||||
// Add the definitions from the compiled builtins-c.c file
|
||||
extern unsigned char builtins_bitcode_c[];
|
||||
extern int builtins_bitcode_c_length;
|
||||
lAddBitcode(builtins_bitcode_c, builtins_bitcode_c_length, module, symbolTable);
|
||||
|
||||
// Next, add the target's custom implementations of the various needed
|
||||
// builtin functions (e.g. __masked_store_32(), etc).
|
||||
switch (g->target.isa) {
|
||||
case Target::SSE2:
|
||||
extern unsigned char stdlib_bitcode_sse2[];
|
||||
extern int stdlib_bitcode_sse2_length;
|
||||
lAddBitcode(stdlib_bitcode_sse2, stdlib_bitcode_sse2_length, module,
|
||||
extern unsigned char builtins_bitcode_sse2[];
|
||||
extern int builtins_bitcode_sse2_length;
|
||||
lAddBitcode(builtins_bitcode_sse2, builtins_bitcode_sse2_length, module,
|
||||
symbolTable);
|
||||
break;
|
||||
case Target::SSE4:
|
||||
extern unsigned char stdlib_bitcode_sse4[];
|
||||
extern int stdlib_bitcode_sse4_length;
|
||||
extern unsigned char stdlib_bitcode_sse4x2[];
|
||||
extern int stdlib_bitcode_sse4x2_length;
|
||||
extern unsigned char builtins_bitcode_sse4[];
|
||||
extern int builtins_bitcode_sse4_length;
|
||||
extern unsigned char builtins_bitcode_sse4x2[];
|
||||
extern int builtins_bitcode_sse4x2_length;
|
||||
switch (g->target.vectorWidth) {
|
||||
case 4:
|
||||
lAddBitcode(stdlib_bitcode_sse4, stdlib_bitcode_sse4_length,
|
||||
lAddBitcode(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 8:
|
||||
lAddBitcode(stdlib_bitcode_sse4x2, stdlib_bitcode_sse4x2_length,
|
||||
lAddBitcode(builtins_bitcode_sse4x2, builtins_bitcode_sse4x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
@@ -502,92 +402,15 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
}
|
||||
break;
|
||||
case Target::AVX:
|
||||
extern unsigned char stdlib_bitcode_avx[];
|
||||
extern int stdlib_bitcode_avx_length;
|
||||
lAddBitcode(stdlib_bitcode_avx, stdlib_bitcode_avx_length, module,
|
||||
extern unsigned char builtins_bitcode_avx[];
|
||||
extern int builtins_bitcode_avx_length;
|
||||
lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module,
|
||||
symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error");
|
||||
}
|
||||
|
||||
// Add a declaration of void *ISPCMalloc(int64_t). The user is
|
||||
// responsible for linking in a definition of this if it's needed by
|
||||
// the compiled program.
|
||||
{ std::vector<const llvm::Type *> argTypes;
|
||||
argTypes.push_back(llvm::Type::getInt64Ty(*ctx));
|
||||
llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType,
|
||||
argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
|
||||
"ISPCMalloc", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
// Add a declaration of void ISPCFree(void *). The user is
|
||||
// responsible for linking in a definition of this if it's needed by
|
||||
// the compiled program.
|
||||
{ std::vector<const llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerType);
|
||||
llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType,
|
||||
argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
|
||||
"ISPCFree", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
// Add a declaration of void ISPCLaunch(void *funcPtr, void *data).
|
||||
// The user is responsible for linking in a definition of this if it's
|
||||
// needed by the compiled program.
|
||||
{ std::vector<const llvm::Type *> argTypes;
|
||||
argTypes.push_back(LLVMTypes::VoidPointerType);
|
||||
argTypes.push_back(LLVMTypes::VoidPointerType);
|
||||
llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType,
|
||||
argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
|
||||
"ISPCLaunch", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
// Add a declaration of void ISPCSync(). The user is responsible for
|
||||
// linking in a definition of this if it's needed by the compiled
|
||||
// program.
|
||||
{
|
||||
std::vector<const llvm::Type *> argTypes;
|
||||
llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType,
|
||||
argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
|
||||
"ISPCSync", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
|
||||
|
||||
// Add a declaration of void ISPCInstrument(void *, void *, int, int).
|
||||
// The user is responsible for linking in a definition of this if it's
|
||||
// needed by the compiled program.
|
||||
{
|
||||
std::vector<const llvm::Type *> argTypes;
|
||||
argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0));
|
||||
argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0));
|
||||
argTypes.push_back(LLVMTypes::Int32Type);
|
||||
argTypes.push_back(LLVMTypes::Int32Type);
|
||||
llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType,
|
||||
argTypes, false);
|
||||
llvm::Function *func =
|
||||
llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
|
||||
"ISPCInstrument", module);
|
||||
func->setDoesNotThrow(true);
|
||||
}
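(Aside, not part of the diff: as the comments above note, the user must link in definitions of these runtime hooks if the compiled program needs them. The C++ below is a minimal serial sketch under stated assumptions — the signatures follow the comments in this function, and the task function pointer type passed to ISPCLaunch is assumed to be void(void *); a real program would normally supply a tasking implementation instead.)

```cpp
#include <cstdint>
#include <cstdlib>

extern "C" {
    void *ISPCMalloc(int64_t size) { return malloc((size_t)size); }
    void  ISPCFree(void *ptr)      { free(ptr); }

    void  ISPCLaunch(void *funcPtr, void *data) {
        // Serial version: run the launched task immediately on this thread.
        typedef void (*TaskFn)(void *);
        ((TaskFn)funcPtr)(data);
    }

    void  ISPCSync() { /* nothing to wait for in the serial version */ }
}
```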
|
||||
|
||||
// Declare various placeholder functions that the optimizer will later
|
||||
// find and replace with something more useful.
|
||||
lDeclareCompileTimeConstant(module);
|
||||
lDeclarePseudoGathers(module);
|
||||
lDeclarePseudoScatters(module);
|
||||
lDeclarePseudoMaskedStore(module);
|
||||
|
||||
// define the 'programCount' builtin variable
|
||||
lDefineConstantInt("programCount", g->target.vectorWidth, module, symbolTable);
|
||||
|
||||
@@ -608,10 +431,15 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
|
||||
if (includeStdlibISPC) {
|
||||
// If the user wants the standard library to be included, parse the
|
||||
// serialized version of the stdlib.ispc file to get its definitions
|
||||
// added.
|
||||
extern const char *stdlib_code;
|
||||
// serialized version of the stdlib.ispc file to get its
|
||||
// definitions added. Disable emission of performance warnings for
|
||||
// now, since the user doesn't care about any of that in the stdlib
|
||||
// implementation...
|
||||
bool epf = g->emitPerfWarnings;
|
||||
g->emitPerfWarnings = false;
|
||||
extern char stdlib_code[];
|
||||
yy_scan_string(stdlib_code);
|
||||
yyparse();
|
||||
g->emitPerfWarnings = epf;
|
||||
}
|
||||
}
|
||||
|
||||
1517 builtins.m4 (new file; diff suppressed because it is too large)
347 ctx.cpp
@@ -147,7 +147,7 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
||||
if (!returnType || returnType == AtomicType::Void)
|
||||
returnValuePtr = NULL;
|
||||
else {
|
||||
const llvm::Type *ftype = returnType->LLVMType(g->ctx);
|
||||
LLVM_TYPE_CONST llvm::Type *ftype = returnType->LLVMType(g->ctx);
|
||||
returnValuePtr = AllocaInst(ftype, "return_value_memory");
|
||||
// FIXME: don't do this store???
|
||||
StoreInst(llvm::Constant::getNullValue(ftype), returnValuePtr);
|
||||
@@ -695,7 +695,8 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
|
||||
// Call the target-dependent movmsk function to turn the vector mask
|
||||
// into an i32 value
|
||||
std::vector<Symbol *> *mm = m->symbolTable->LookupFunction("__movmsk");
|
||||
assert(mm && mm->size() == 1);
|
||||
// There should be one with signed int signature, one unsigned int.
|
||||
assert(mm && mm->size() == 2);
|
||||
llvm::Function *fmm = (*mm)[0]->function;
|
||||
return CallInst(fmm, v, "val_movmsk");
|
||||
}
|
||||
@@ -734,11 +735,12 @@ FunctionEmitContext::CreateBasicBlock(const char *name) {
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
|
||||
const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(b->getType());
|
||||
LLVM_TYPE_CONST llvm::ArrayType *at =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(b->getType());
|
||||
if (at) {
|
||||
// If we're given an array of vectors of i1s, then do the
|
||||
// conversion for each of the elements
|
||||
const llvm::Type *boolArrayType =
|
||||
LLVM_TYPE_CONST llvm::Type *boolArrayType =
|
||||
llvm::ArrayType::get(LLVMTypes::BoolVectorType, at->getNumElements());
|
||||
llvm::Value *ret = llvm::UndefValue::get(boolArrayType);
|
||||
|
||||
@@ -756,22 +758,29 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::EmitMalloc(const llvm::Type *ty) {
|
||||
FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) {
|
||||
// Emit code to compute the size of the given type using a GEP with a
|
||||
// NULL base pointer, indexing one element of the given type, and
|
||||
// casting the resulting 'pointer' to an int giving its size.
|
||||
const llvm::Type *ptrType = llvm::PointerType::get(ty, 0);
|
||||
LLVM_TYPE_CONST llvm::Type *ptrType = llvm::PointerType::get(ty, 0);
|
||||
llvm::Value *nullPtr = llvm::Constant::getNullValue(ptrType);
|
||||
llvm::Value *index[1] = { LLVMInt32(1) };
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
|
||||
llvm::Value *poffset = llvm::GetElementPtrInst::Create(nullPtr, arrayRef,
|
||||
"offset_ptr", bblock);
|
||||
#else
|
||||
llvm::Value *poffset = llvm::GetElementPtrInst::Create(nullPtr, &index[0], &index[1],
|
||||
"offset_ptr", bblock);
|
||||
#endif
|
||||
AddDebugPos(poffset);
|
||||
llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int");
|
||||
llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int");
|
||||
|
||||
// And given the size, call the malloc function
|
||||
llvm::Function *fmalloc = m->module->getFunction("ISPCMalloc");
|
||||
assert(fmalloc != NULL);
|
||||
llvm::Value *mem = CallInst(fmalloc, sizeOf, "raw_argmem");
|
||||
llvm::Value *mem = CallInst(fmalloc, sizeOf, LLVMInt32(align),
|
||||
"raw_argmem");
|
||||
// Cast the void * back to the result pointer type
|
||||
return BitCastInst(mem, ptrType, "mem_bitcast");
|
||||
}
|
||||
@@ -795,8 +804,13 @@ lGetStringAsValue(llvm::BasicBlock *bblock, const char *s) {
|
||||
llvm::GlobalValue::InternalLinkage,
|
||||
sConstant, s);
|
||||
llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(0) };
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
|
||||
return llvm::GetElementPtrInst::Create(sPtr, arrayRef, "sptr", bblock);
|
||||
#else
|
||||
return llvm::GetElementPtrInst::Create(sPtr, &indices[0], &indices[2],
|
||||
"sptr", bblock);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -939,15 +953,16 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
|
||||
Otherwise return zero.
|
||||
*/
|
||||
static int
|
||||
lArrayVectorWidth(const llvm::Type *t) {
|
||||
const llvm::ArrayType *arrayType = llvm::dyn_cast<const llvm::ArrayType>(t);
|
||||
lArrayVectorWidth(LLVM_TYPE_CONST llvm::Type *t) {
|
||||
LLVM_TYPE_CONST llvm::ArrayType *arrayType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(t);
|
||||
if (arrayType == NULL)
|
||||
return 0;
|
||||
|
||||
// We shouldn't be seeing arrays of anything but vectors being passed
|
||||
// to things like FunctionEmitContext::BinaryOperator() as operands
|
||||
const llvm::VectorType *vectorElementType =
|
||||
llvm::dyn_cast<const llvm::VectorType>(arrayType->getElementType());
|
||||
LLVM_TYPE_CONST llvm::VectorType *vectorElementType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType());
|
||||
assert(vectorElementType != NULL &&
|
||||
(int)vectorElementType->getNumElements() == g->target.vectorWidth);
|
||||
return (int)arrayType->getNumElements();
|
||||
@@ -964,7 +979,7 @@ FunctionEmitContext::BinaryOperator(llvm::Instruction::BinaryOps inst,
|
||||
}
|
||||
|
||||
assert(v0->getType() == v1->getType());
|
||||
const llvm::Type *type = v0->getType();
|
||||
LLVM_TYPE_CONST llvm::Type *type = v0->getType();
|
||||
int arraySize = lArrayVectorWidth(type);
|
||||
if (arraySize == 0) {
|
||||
llvm::Instruction *bop =
|
||||
@@ -998,7 +1013,7 @@ FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) {
|
||||
// Similarly to BinaryOperator, do the operation on all the elements of
|
||||
// the array if we're given an array type; otherwise just do the
|
||||
// regular llvm operation.
|
||||
const llvm::Type *type = v->getType();
|
||||
LLVM_TYPE_CONST llvm::Type *type = v->getType();
|
||||
int arraySize = lArrayVectorWidth(type);
|
||||
if (arraySize == 0) {
|
||||
llvm::Instruction *binst =
|
||||
@@ -1023,20 +1038,20 @@ FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) {
|
||||
// Given the llvm Type that represents an ispc VectorType, return an
|
||||
// equally-shaped type with boolean elements. (This is the type that will
|
||||
// be returned from CmpInst with ispc VectorTypes).
|
||||
static const llvm::Type *
|
||||
lGetMatchingBoolVectorType(const llvm::Type *type) {
|
||||
const llvm::ArrayType *arrayType =
|
||||
llvm::dyn_cast<const llvm::ArrayType>(type);
|
||||
static LLVM_TYPE_CONST llvm::Type *
|
||||
lGetMatchingBoolVectorType(LLVM_TYPE_CONST llvm::Type *type) {
|
||||
LLVM_TYPE_CONST llvm::ArrayType *arrayType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
|
||||
// should only be called for vector typed stuff...
|
||||
assert(arrayType != NULL);
|
||||
|
||||
const llvm::VectorType *vectorElementType =
|
||||
llvm::dyn_cast<const llvm::VectorType>(arrayType->getElementType());
|
||||
LLVM_TYPE_CONST llvm::VectorType *vectorElementType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType());
|
||||
assert(vectorElementType != NULL &&
|
||||
(int)vectorElementType->getNumElements() == g->target.vectorWidth);
|
||||
|
||||
const llvm::Type *base = llvm::VectorType::get(LLVMTypes::BoolType,
|
||||
g->target.vectorWidth);
|
||||
LLVM_TYPE_CONST llvm::Type *base =
|
||||
llvm::VectorType::get(LLVMTypes::BoolType, g->target.vectorWidth);
|
||||
return llvm::ArrayType::get(base, arrayType->getNumElements());
|
||||
}
|
||||
|
||||
@@ -1052,7 +1067,7 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
|
||||
}
|
||||
|
||||
assert(v0->getType() == v1->getType());
|
||||
const llvm::Type *type = v0->getType();
|
||||
LLVM_TYPE_CONST llvm::Type *type = v0->getType();
|
||||
int arraySize = lArrayVectorWidth(type);
|
||||
if (arraySize == 0) {
|
||||
llvm::Instruction *ci =
|
||||
@@ -1062,7 +1077,7 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
|
||||
return ci;
|
||||
}
|
||||
else {
|
||||
const llvm::Type *boolType = lGetMatchingBoolVectorType(type);
|
||||
LLVM_TYPE_CONST llvm::Type *boolType = lGetMatchingBoolVectorType(type);
|
||||
llvm::Value *ret = llvm::UndefValue::get(boolType);
|
||||
for (int i = 0; i < arraySize; ++i) {
|
||||
llvm::Value *a = ExtractInst(v0, i);
|
||||
@@ -1076,16 +1091,17 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::BitCastInst(llvm::Value *value, const llvm::Type *type,
|
||||
FunctionEmitContext::BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const llvm::Type *valType = value->getType();
|
||||
const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(valType);
|
||||
if (at && llvm::isa<const llvm::PointerType>(at->getElementType())) {
|
||||
LLVM_TYPE_CONST llvm::Type *valType = value->getType();
|
||||
LLVM_TYPE_CONST llvm::ArrayType *at =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
|
||||
if (at && llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType())) {
|
||||
// If we're bitcasting an array of pointers, we have a varying
|
||||
// lvalue; apply the corresponding bitcast to each of the
|
||||
// individual pointers and return the result array.
|
||||
@@ -1109,42 +1125,74 @@ FunctionEmitContext::BitCastInst(llvm::Value *value, const llvm::Type *type,
|
||||
}
|
||||
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::PtrToIntInst(llvm::Value *value, const llvm::Type *type,
|
||||
llvm::Value *
|
||||
FunctionEmitContext::PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// TODO: we should probably handle the array case as in
|
||||
// e.g. BitCastInst(), but we don't currently need that functionality
|
||||
llvm::Instruction *inst =
|
||||
new llvm::PtrToIntInst(value, type, name ? name : "ptr2int", bblock);
|
||||
AddDebugPos(inst);
|
||||
return inst;
|
||||
LLVM_TYPE_CONST llvm::Type *valType = value->getType();
|
||||
LLVM_TYPE_CONST llvm::ArrayType *at =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
|
||||
if (at && llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType())) {
|
||||
// varying lvalue -> apply ptr to int to the individual pointers
|
||||
assert((int)at->getNumElements() == g->target.vectorWidth);
|
||||
|
||||
llvm::Value *ret =
|
||||
llvm::UndefValue::get(llvm::ArrayType::get(type, g->target.vectorWidth));
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i) {
|
||||
llvm::Value *elt = ExtractInst(value, i);
|
||||
llvm::Value *p2i = PtrToIntInst(elt, type, name);
|
||||
ret = InsertInst(ret, p2i, i);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
else {
|
||||
llvm::Instruction *inst =
|
||||
new llvm::PtrToIntInst(value, type, name ? name : "ptr2int", bblock);
|
||||
AddDebugPos(inst);
|
||||
return inst;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::IntToPtrInst(llvm::Value *value, const llvm::Type *type,
|
||||
llvm::Value *
|
||||
FunctionEmitContext::IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// TODO: we should probably handle the array case as in
|
||||
// e.g. BitCastInst(), but we don't currently need that functionality
|
||||
llvm::Instruction *inst =
|
||||
new llvm::IntToPtrInst(value, type, name ? name : "int2ptr", bblock);
|
||||
AddDebugPos(inst);
|
||||
return inst;
|
||||
LLVM_TYPE_CONST llvm::Type *valType = value->getType();
|
||||
LLVM_TYPE_CONST llvm::ArrayType *at =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
|
||||
if (at && llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType())) {
|
||||
// varying lvalue -> apply int to ptr to the individual pointers
|
||||
assert((int)at->getNumElements() == g->target.vectorWidth);
|
||||
|
||||
llvm::Value *ret =
|
||||
llvm::UndefValue::get(llvm::ArrayType::get(type, g->target.vectorWidth));
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i) {
|
||||
llvm::Value *elt = ExtractInst(value, i);
|
||||
llvm::Value *i2p = IntToPtrInst(elt, type, name);
|
||||
ret = InsertInst(ret, i2p, i);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
else {
|
||||
llvm::Instruction *inst =
|
||||
new llvm::IntToPtrInst(value, type, name ? name : "int2ptr", bblock);
|
||||
AddDebugPos(inst);
|
||||
return inst;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::TruncInst(llvm::Value *value, const llvm::Type *type,
|
||||
FunctionEmitContext::TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
@@ -1162,7 +1210,7 @@ FunctionEmitContext::TruncInst(llvm::Value *value, const llvm::Type *type,
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
|
||||
const llvm::Type *type, const char *name) {
|
||||
LLVM_TYPE_CONST llvm::Type *type, const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
return NULL;
|
||||
@@ -1178,7 +1226,7 @@ FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
|
||||
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::FPCastInst(llvm::Value *value, const llvm::Type *type,
|
||||
FunctionEmitContext::FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
@@ -1195,7 +1243,7 @@ FunctionEmitContext::FPCastInst(llvm::Value *value, const llvm::Type *type,
|
||||
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::SExtInst(llvm::Value *value, const llvm::Type *type,
|
||||
FunctionEmitContext::SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
@@ -1212,7 +1260,7 @@ FunctionEmitContext::SExtInst(llvm::Value *value, const llvm::Type *type,
|
||||
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::ZExtInst(llvm::Value *value, const llvm::Type *type,
|
||||
FunctionEmitContext::ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name) {
|
||||
if (value == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
@@ -1238,22 +1286,30 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0
|
||||
|
||||
// FIXME: do we need need to handle the case of the first index being
|
||||
// varying? It's currently needed...
|
||||
assert(!llvm::isa<const llvm::VectorType>(index0->getType()));
|
||||
assert(!llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index0->getType()));
|
||||
|
||||
const llvm::Type *basePtrType = basePtr->getType();
|
||||
const llvm::ArrayType *baseArrayType =
|
||||
llvm::dyn_cast<const llvm::ArrayType>(basePtrType);
|
||||
LLVM_TYPE_CONST llvm::Type *basePtrType = basePtr->getType();
|
||||
LLVM_TYPE_CONST llvm::ArrayType *baseArrayType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(basePtrType);
|
||||
bool baseIsVaryingTypePointer = (baseArrayType != NULL) &&
|
||||
llvm::isa<const llvm::PointerType>(baseArrayType->getElementType());
|
||||
bool indexIsVaryingType = llvm::isa<const llvm::VectorType>(index1->getType());
|
||||
llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(baseArrayType->getElementType());
|
||||
bool indexIsVaryingType =
|
||||
llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index1->getType());
|
||||
|
||||
if (!indexIsVaryingType && !baseIsVaryingTypePointer) {
|
||||
// The easy case: both the base pointer and the indices are
|
||||
// uniform, so just emit the regular LLVM GEP instruction
|
||||
llvm::Value *indices[2] = { index0, index1 };
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
|
||||
llvm::Instruction *inst =
|
||||
llvm::GetElementPtrInst::Create(basePtr, arrayRef,
|
||||
name ? name : "gep", bblock);
|
||||
#else
|
||||
llvm::Instruction *inst =
|
||||
llvm::GetElementPtrInst::Create(basePtr, &indices[0], &indices[2],
|
||||
name ? name : "gep", bblock);
|
||||
#endif
|
||||
AddDebugPos(inst);
|
||||
return inst;
|
||||
}
|
||||
@@ -1284,9 +1340,10 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0
|
||||
// This is kind of a hack: use the type from the GEP to
|
||||
// figure out the return type and the first time through,
|
||||
// create an undef value of that type here
|
||||
const llvm::PointerType *elementPtrType =
|
||||
llvm::dyn_cast<const llvm::PointerType>(eltPtr->getType());
|
||||
const llvm::Type *elementType = elementPtrType->getElementType();
|
||||
LLVM_TYPE_CONST llvm::PointerType *elementPtrType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(eltPtr->getType());
|
||||
LLVM_TYPE_CONST llvm::Type *elementType =
|
||||
elementPtrType->getElementType();
|
||||
lret = llvm::UndefValue::get(LLVMPointerVectorType(elementType));
|
||||
}
|
||||
|
||||
@@ -1313,7 +1370,7 @@ FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (llvm::isa<const llvm::PointerType>(lvalue->getType())) {
|
||||
if (llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(lvalue->getType())) {
|
||||
// If the lvalue is a straight up regular pointer, then just issue
|
||||
// a regular load. First figure out the alignment; in general we
|
||||
// can just assume the natural alignment (0 here), but for varying
|
||||
@@ -1340,7 +1397,7 @@ FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type,
|
||||
// information we need from the LLVM::Type, so have to carry the
|
||||
// ispc type in through this path..
|
||||
assert(type != NULL);
|
||||
assert(llvm::isa<const llvm::ArrayType>(lvalue->getType()));
|
||||
assert(llvm::isa<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));
|
||||
return gather(lvalue, type, name);
|
||||
}
|
||||
}
|
||||
@@ -1350,19 +1407,19 @@ llvm::Value *
|
||||
FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
|
||||
const char *name) {
|
||||
// We should have a varying lvalue if we get here...
|
||||
assert(llvm::dyn_cast<const llvm::ArrayType>(lvalue->getType()));
|
||||
assert(llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));
|
||||
|
||||
const llvm::Type *retType = type->LLVMType(g->ctx);
|
||||
LLVM_TYPE_CONST llvm::Type *retType = type->LLVMType(g->ctx);
|
||||
|
||||
const StructType *st = dynamic_cast<const StructType *>(type);
|
||||
if (st) {
|
||||
// If we're gathering structures, do an element-wise gather
|
||||
// recursively.
|
||||
llvm::Value *retValue = llvm::UndefValue::get(retType);
|
||||
for (int i = 0; i < st->NumElements(); ++i) {
|
||||
for (int i = 0; i < st->GetElementCount(); ++i) {
|
||||
llvm::Value *eltPtrs = GetElementPtrInst(lvalue, 0, i);
|
||||
// This in turn will be another gather
|
||||
llvm::Value *eltValues = LoadInst(eltPtrs, st->GetMemberType(i),
|
||||
llvm::Value *eltValues = LoadInst(eltPtrs, st->GetElementType(i),
|
||||
name);
|
||||
retValue = InsertInst(retValue, eltValues, i, "set_value");
|
||||
}
|
||||
@@ -1378,7 +1435,7 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
|
||||
// the GEP stuff in the loop below ends up computing pointers based
|
||||
// on elements in the vectors rather than incorrectly advancing to
|
||||
// the next vector...
|
||||
const llvm::Type *eltType =
|
||||
LLVM_TYPE_CONST llvm::Type *eltType =
|
||||
vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx);
|
||||
lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0));
|
||||
|
||||
@@ -1409,17 +1466,20 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
|
||||
llvm::Value *mask = GetMask();
|
||||
llvm::Function *gather = NULL;
|
||||
// Figure out which gather function to call based on the size of
|
||||
// the elements; will need to generalize this for 8 and 16-bit
|
||||
// types.
|
||||
// the elements.
|
||||
if (retType == LLVMTypes::DoubleVectorType ||
|
||||
retType == LLVMTypes::Int64VectorType)
|
||||
gather = m->module->getFunction("__pseudo_gather_64");
|
||||
else {
|
||||
assert(retType == LLVMTypes::FloatVectorType ||
|
||||
retType == LLVMTypes::Int32VectorType);
|
||||
else if (retType == LLVMTypes::FloatVectorType ||
|
||||
retType == LLVMTypes::Int32VectorType)
|
||||
gather = m->module->getFunction("__pseudo_gather_32");
|
||||
else if (retType == LLVMTypes::Int16VectorType)
|
||||
gather = m->module->getFunction("__pseudo_gather_16");
|
||||
else {
|
||||
assert(retType == LLVMTypes::Int8VectorType);
|
||||
gather = m->module->getFunction("__pseudo_gather_8");
|
||||
}
|
||||
assert(gather);
|
||||
assert(gather != NULL);
|
||||
|
||||
llvm::Value *voidlvalue = BitCastInst(lvalue, LLVMTypes::VoidPointerType);
|
||||
llvm::Instruction *call = CallInst(gather, voidlvalue, mask, name);
|
||||
@@ -1467,7 +1527,7 @@ FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::AllocaInst(const llvm::Type *llvmType, const char *name,
|
||||
FunctionEmitContext::AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name,
|
||||
int align, bool atEntryBlock) {
|
||||
llvm::AllocaInst *inst = NULL;
|
||||
if (atEntryBlock) {
|
||||
@@ -1482,6 +1542,17 @@ FunctionEmitContext::AllocaInst(const llvm::Type *llvmType, const char *name,
|
||||
// current basic block
|
||||
inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock);
|
||||
|
||||
// If no alignment was specified but we have an array of a uniform
|
||||
// type, then align it to 4 * the native vector width; it's not
|
||||
// unlikely that this array will be loaded into varying variables with
|
||||
// what will be aligned accesses if the uniform -> varying load is done
|
||||
// in regular chunks.
|
||||
LLVM_TYPE_CONST llvm::ArrayType *arrayType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(llvmType);
|
||||
if (align == 0 && arrayType != NULL &&
|
||||
!llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType()))
|
||||
align = 4 * g->target.nativeVectorWidth;
|
||||
|
||||
if (align != 0)
|
||||
inst->setAlignment(align);
|
||||
// Don't add debugging info to alloca instructions
|
||||
@@ -1504,43 +1575,31 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
return;
|
||||
}
|
||||
|
||||
assert(llvm::isa<const llvm::PointerType>(lvalue->getType()));
|
||||
assert(llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(lvalue->getType()));
|
||||
|
||||
const StructType *structType = dynamic_cast<const StructType *>(rvalueType);
|
||||
if (structType != NULL) {
|
||||
// Assigning a structure
|
||||
for (int i = 0; i < structType->NumElements(); ++i) {
|
||||
const CollectionType *collectionType =
|
||||
dynamic_cast<const CollectionType *>(rvalueType);
|
||||
if (collectionType != NULL) {
|
||||
// Assigning a structure / array / vector. Handle each element
|
||||
// individually with what turns into a recursive call to
|
||||
// maskedStore()
|
||||
for (int i = 0; i < collectionType->GetElementCount(); ++i) {
|
||||
llvm::Value *eltValue = ExtractInst(rvalue, i, "rvalue_member");
|
||||
llvm::Value *eltLValue = GetElementPtrInst(lvalue, 0, i,
|
||||
"struct_lvalue_ptr");
|
||||
StoreInst(eltValue, eltLValue, storeMask,
|
||||
structType->GetMemberType(i));
|
||||
collectionType->GetElementType(i));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
const SequentialType *sequentialType =
|
||||
dynamic_cast<const SequentialType *>(rvalueType);
|
||||
if (sequentialType != NULL) {
|
||||
// Assigning arrays and vectors. Handle each element individually
|
||||
// with what turns into a recursive call to makedStore()
|
||||
for (int i = 0; i < sequentialType->GetElementCount(); ++i) {
|
||||
llvm::Value *eltLValue = GetElementPtrInst(lvalue, 0, i, "lval_i_ptr");
|
||||
llvm::Value *eltValue = ExtractInst(rvalue, i, "array_i_val");
|
||||
StoreInst(eltValue, eltLValue, storeMask,
|
||||
sequentialType->GetElementType());
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// We must have a regular atomic type at this point
|
||||
assert(dynamic_cast<const AtomicType *>(rvalueType) != NULL);
|
||||
// We must have a regular atomic or enumerator type at this point
|
||||
assert(dynamic_cast<const AtomicType *>(rvalueType) != NULL ||
|
||||
dynamic_cast<const EnumType *>(rvalueType) != NULL);
|
||||
rvalueType = rvalueType->GetAsNonConstType();
|
||||
|
||||
llvm::Function *maskedStoreFunc = NULL;
|
||||
// Figure out if we need a 32-bit or 64-bit masked store. This
|
||||
// will need to be generalized when/if 8 and 16-bit data types are
|
||||
// added.
|
||||
// Figure out if we need a 8, 16, 32 or 64-bit masked store.
|
||||
if (rvalueType == AtomicType::VaryingDouble ||
|
||||
rvalueType == AtomicType::VaryingInt64 ||
|
||||
rvalueType == AtomicType::VaryingUInt64) {
|
||||
@@ -1550,12 +1609,11 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType,
|
||||
"rvalue_to_int64");
|
||||
}
|
||||
else {
|
||||
assert(rvalueType == AtomicType::VaryingFloat ||
|
||||
rvalueType == AtomicType::VaryingBool ||
|
||||
rvalueType == AtomicType::VaryingInt32 ||
|
||||
rvalueType == AtomicType::VaryingUInt32);
|
||||
|
||||
else if (rvalueType == AtomicType::VaryingFloat ||
|
||||
rvalueType == AtomicType::VaryingBool ||
|
||||
rvalueType == AtomicType::VaryingInt32 ||
|
||||
rvalueType == AtomicType::VaryingUInt32 ||
|
||||
dynamic_cast<const EnumType *>(rvalueType) != NULL) {
|
||||
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32");
|
||||
lvalue = BitCastInst(lvalue, LLVMTypes::Int32VectorPointerType,
|
||||
"lvalue_to_int32vecptr");
|
||||
@@ -1563,6 +1621,18 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType,
|
||||
"rvalue_to_int32");
|
||||
}
|
||||
else if (rvalueType == AtomicType::VaryingInt16 ||
|
||||
rvalueType == AtomicType::VaryingUInt16) {
|
||||
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_16");
|
||||
lvalue = BitCastInst(lvalue, LLVMTypes::Int16VectorPointerType,
|
||||
"lvalue_to_int16vecptr");
|
||||
}
|
||||
else if (rvalueType == AtomicType::VaryingInt8 ||
|
||||
rvalueType == AtomicType::VaryingUInt8) {
|
||||
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_8");
|
||||
lvalue = BitCastInst(lvalue, LLVMTypes::Int8VectorPointerType,
|
||||
"lvalue_to_int8vecptr");
|
||||
}
|
||||
|
||||
std::vector<llvm::Value *> args;
|
||||
args.push_back(lvalue);
|
||||
@@ -1583,15 +1653,15 @@ void
|
||||
FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
llvm::Value *storeMask, const Type *rvalueType) {
|
||||
assert(rvalueType->IsVaryingType());
|
||||
assert(llvm::isa<const llvm::ArrayType>(lvalue->getType()));
|
||||
assert(llvm::isa<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));
|
||||
|
||||
const StructType *structType = dynamic_cast<const StructType *>(rvalueType);
|
||||
if (structType) {
|
||||
// Scatter the struct elements individually
|
||||
for (int i = 0; i < structType->NumElements(); ++i) {
|
||||
for (int i = 0; i < structType->GetElementCount(); ++i) {
|
||||
llvm::Value *lv = GetElementPtrInst(lvalue, 0, i);
|
||||
llvm::Value *rv = ExtractInst(rvalue, i);
|
||||
scatter(rv, lv, storeMask, structType->GetMemberType(i));
|
||||
scatter(rv, lv, storeMask, structType->GetElementType(i));
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -1602,7 +1672,8 @@ FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
// the GEP stuff in the loop below ends up computing pointers based
|
||||
// on elements in the vectors rather than incorrectly advancing to
|
||||
// the next vector...
|
||||
const llvm::Type *eltType = vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx);
|
||||
LLVM_TYPE_CONST llvm::Type *eltType =
|
||||
vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx);
|
||||
lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0));
|
||||
|
||||
for (int i = 0; i < vt->GetElementCount(); ++i) {
|
||||
@@ -1620,20 +1691,21 @@ FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
assert(dynamic_cast<const AtomicType *>(rvalueType) != NULL);
|
||||
|
||||
llvm::Function *func = NULL;
|
||||
const llvm::Type *type = rvalue->getType();
|
||||
LLVM_TYPE_CONST llvm::Type *type = rvalue->getType();
|
||||
if (type == LLVMTypes::DoubleVectorType ||
|
||||
type == LLVMTypes::Int64VectorType) {
|
||||
func = m->module->getFunction("__pseudo_scatter_64");
|
||||
rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, "rvalue2int");
|
||||
}
|
||||
else {
|
||||
// FIXME: if this hits, presumably it's due to needing int8 and/or
|
||||
// int16 versions of scatter...
|
||||
assert(type == LLVMTypes::FloatVectorType ||
|
||||
type == LLVMTypes::Int32VectorType);
|
||||
else if (type == LLVMTypes::FloatVectorType ||
|
||||
type == LLVMTypes::Int32VectorType) {
|
||||
func = m->module->getFunction("__pseudo_scatter_32");
|
||||
rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, "rvalue2int");
|
||||
}
|
||||
else if (type == LLVMTypes::Int16VectorType)
|
||||
func = m->module->getFunction("__pseudo_scatter_16");
|
||||
else if (type == LLVMTypes::Int8VectorType)
|
||||
func = m->module->getFunction("__pseudo_scatter_8");
|
||||
assert(func != NULL);
|
||||
|
||||
AddInstrumentationPoint("scatter");
|
||||
@@ -1687,7 +1759,7 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, bblock);
|
||||
AddDebugPos(si);
|
||||
}
|
||||
else if (llvm::isa<const llvm::ArrayType>(lvalue->getType()))
|
||||
else if (llvm::isa<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()))
|
||||
// We have a varying lvalue (an array of pointers), so it's time to
|
||||
// scatter
|
||||
scatter(rvalue, lvalue, storeMask, rvalueType);
|
||||
@@ -1731,7 +1803,7 @@ FunctionEmitContext::ExtractInst(llvm::Value *v, int elt, const char *name) {
|
||||
}
|
||||
|
||||
llvm::Instruction *ei = NULL;
|
||||
if (llvm::isa<const llvm::VectorType>(v->getType()))
|
||||
if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(v->getType()))
|
||||
ei = llvm::ExtractElementInst::Create(v, LLVMInt32(elt),
|
||||
name ? name : "extract", bblock);
|
||||
else
|
||||
@@ -1751,7 +1823,7 @@ FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
|
||||
}
|
||||
|
||||
llvm::Instruction *ii = NULL;
|
||||
if (llvm::isa<const llvm::VectorType>(v->getType()))
|
||||
if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(v->getType()))
|
||||
ii = llvm::InsertElementInst::Create(v, eltVal, LLVMInt32(elt),
|
||||
name ? name : "insert", bblock);
|
||||
else
|
||||
@@ -1763,7 +1835,7 @@ FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
|
||||
|
||||
|
||||
llvm::PHINode *
|
||||
FunctionEmitContext::PhiNode(const llvm::Type *type, int count,
|
||||
FunctionEmitContext::PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
|
||||
const char *name) {
|
||||
llvm::PHINode *pn = llvm::PHINode::Create(type,
|
||||
#if !defined(LLVM_2_8) && !defined(LLVM_2_9)
|
||||
@@ -1800,9 +1872,14 @@ FunctionEmitContext::CallInst(llvm::Function *func,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::Instruction *ci =
|
||||
llvm::CallInst::Create(func, args, name ? name : "", bblock);
|
||||
#else
|
||||
llvm::Instruction *ci =
|
||||
llvm::CallInst::Create(func, args.begin(), args.end(),
|
||||
name ? name : "", bblock);
|
||||
#endif
|
||||
AddDebugPos(ci);
|
||||
return ci;
|
||||
}
|
||||
@@ -1816,10 +1893,15 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::Instruction *ci =
|
||||
llvm::CallInst::Create(func, arg, name ? name : "", bblock);
|
||||
#else
|
||||
llvm::Value *args[] = { arg };
|
||||
llvm::Instruction *ci =
|
||||
llvm::CallInst::Create(func, &args[0], &args[1], name ? name : "",
|
||||
bblock);
|
||||
#endif
|
||||
AddDebugPos(ci);
|
||||
return ci;
|
||||
}
|
||||
@@ -1834,9 +1916,16 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0,
|
||||
}
|
||||
|
||||
llvm::Value *args[] = { arg0, arg1 };
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::ArrayRef<llvm::Value *> argArrayRef(&args[0], &args[2]);
|
||||
llvm::Instruction *ci =
|
||||
llvm::CallInst::Create(func, argArrayRef, name ? name : "",
|
||||
bblock);
|
||||
#else
|
||||
llvm::Instruction *ci =
|
||||
llvm::CallInst::Create(func, &args[0], &args[2], name ? name : "",
|
||||
bblock);
|
||||
#endif
|
||||
AddDebugPos(ci);
|
||||
return ci;
|
||||
}
|
||||
@@ -1883,20 +1972,28 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
||||
|
||||
launchedTasks = true;
|
||||
|
||||
const llvm::Type *argType = callee->arg_begin()->getType();
|
||||
LLVM_TYPE_CONST llvm::Type *argType = callee->arg_begin()->getType();
|
||||
assert(llvm::PointerType::classof(argType));
|
||||
const llvm::PointerType *pt = static_cast<const llvm::PointerType *>(argType);
|
||||
LLVM_TYPE_CONST llvm::PointerType *pt =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(argType);
|
||||
assert(llvm::StructType::classof(pt->getElementType()));
|
||||
const llvm::StructType *argStructType =
|
||||
static_cast<const llvm::StructType *>(pt->getElementType());
|
||||
LLVM_TYPE_CONST llvm::StructType *argStructType =
|
||||
static_cast<LLVM_TYPE_CONST llvm::StructType *>(pt->getElementType());
|
||||
assert(argStructType->getNumElements() == argVals.size() + 1);
|
||||
|
||||
// Use alloca for space for the task args. KEY DETAIL: pass false
|
||||
// to the call of FunctionEmitContext::AllocaInst so that the alloca
|
||||
// doesn't happen just once at the top of the function, but happens
|
||||
// each time the enclosing basic block executes.
|
||||
int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
// Use malloc() to allocate storage on Windows, since the stack is
|
||||
// generally not big enough there to do enough allocations for lots of
|
||||
// tasks and then things crash horribly...
|
||||
llvm::Value *argmem = EmitMalloc(argStructType, align);
|
||||
#else
|
||||
// Use alloca for space for the task args on OSX And Linux. KEY
|
||||
// DETAIL: pass false to the call of FunctionEmitContext::AllocaInst so
|
||||
// that the alloca doesn't happen just once at the top of the function,
|
||||
// but happens each time the enclosing basic block executes.
|
||||
llvm::Value *argmem = AllocaInst(argStructType, "argmem", align, false);
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType);
|
||||
|
||||
// Copy the values of the parameters into the appropriate place in
|
||||
|
||||
27 ctx.h
@@ -213,7 +213,7 @@ public:
|
||||
/** Emit code to call the user-supplied ISPCMalloc function to
|
||||
allocate space for an object of the given type. Returns the
|
||||
pointer value returned by the ISPCMalloc call. */
|
||||
llvm::Value *EmitMalloc(const llvm::Type *ty);
|
||||
llvm::Value *EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align = 0);
|
||||
|
||||
/** Emit code to call the user-supplied ISPCFree function, passing it
|
||||
the given pointer to storage previously allocated by an
|
||||
@@ -303,21 +303,21 @@ public:
|
||||
llvm::CmpInst::Predicate pred,
|
||||
llvm::Value *v0, llvm::Value *v1, const char *name = NULL);
|
||||
|
||||
llvm::Value *BitCastInst(llvm::Value *value, const llvm::Type *type,
|
||||
llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *PtrToIntInst(llvm::Value *value, const llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *IntToPtrInst(llvm::Value *value, const llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *TruncInst(llvm::Value *value, const llvm::Type *type,
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
|
||||
const llvm::Type *type, const char *name = NULL);
|
||||
llvm::Instruction *FPCastInst(llvm::Value *value, const llvm::Type *type,
|
||||
LLVM_TYPE_CONST llvm::Type *type, const char *name = NULL);
|
||||
llvm::Instruction *FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *SExtInst(llvm::Value *value, const llvm::Type *type,
|
||||
llvm::Instruction *SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *ZExtInst(llvm::Value *value, const llvm::Type *type,
|
||||
llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
|
||||
/** This GEP method is a generalization of the standard one in LLVM; it
|
||||
@@ -347,7 +347,7 @@ public:
|
||||
instruction is added at the start of the function in the entry
|
||||
basic block; if it should be added to the current basic block, then
|
||||
the atEntryBlock parameter should be false. */
|
||||
llvm::Value *AllocaInst(const llvm::Type *llvmType, const char *name = NULL,
|
||||
llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name = NULL,
|
||||
int align = 0, bool atEntryBlock = true);
|
||||
|
||||
/** Standard store instruction; for this variant, the lvalue must be a
|
||||
@@ -378,7 +378,8 @@ public:
|
||||
llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
|
||||
const char *name = NULL);
|
||||
|
||||
llvm::PHINode *PhiNode(const llvm::Type *type, int count, const char *name = NULL);
|
||||
llvm::PHINode *PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
|
||||
llvm::Value *val1, const char *name = NULL);
|
||||
|
||||
|
||||
8 decl.cpp
@@ -318,9 +318,10 @@ Declaration::Print() const {
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void
|
||||
GetStructTypesAndNames(const std::vector<StructDeclaration *> &sd,
|
||||
std::vector<const Type *> *elementTypes,
|
||||
std::vector<std::string> *elementNames) {
|
||||
GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
|
||||
std::vector<const Type *> *elementTypes,
|
||||
std::vector<std::string> *elementNames,
|
||||
std::vector<SourcePos> *elementPositions) {
|
||||
for (unsigned int i = 0; i < sd.size(); ++i) {
|
||||
const Type *type = sd[i]->type;
|
||||
// FIXME: making this fake little DeclSpecs here is really
|
||||
@@ -343,6 +344,7 @@ GetStructTypesAndNames(const std::vector<StructDeclaration *> &sd,
|
||||
|
||||
elementTypes->push_back(d->sym->type);
|
||||
elementNames->push_back(d->sym->name);
|
||||
elementPositions->push_back(d->sym->pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
7 decl.h
@@ -196,8 +196,9 @@ struct StructDeclaration {
|
||||
|
||||
/** Given a set of StructDeclaration instances, this returns the types of
|
||||
the elements of the corresponding struct and their names. */
|
||||
extern void GetStructTypesAndNames(const std::vector<StructDeclaration *> &sd,
|
||||
std::vector<const Type *> *elementTypes,
|
||||
std::vector<std::string> *elementNames);
|
||||
extern void GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
|
||||
std::vector<const Type *> *elementTypes,
|
||||
std::vector<std::string> *elementNames,
|
||||
std::vector<SourcePos> *elementPositions);
|
||||
|
||||
#endif // ISPC_DECL_H
|
||||
|
||||
103 docs/ReleaseNotes.txt Normal file
@@ -0,0 +1,103 @@
|
||||
=== v1.0.5 === (1 August 2011)
|
||||
|
||||
Multi-element vector swizzles are supported; for example, given a 3-wide
|
||||
vector "foo", then expressions like "foo.zyx" and "foo.yz" can be used to
|
||||
construct other short vectors. See
|
||||
http://ispc.github.com/ispc.html#short-vector-types
|
||||
for more details. (Thanks to Pete Couperus for implementing this code!).
|
||||
|
||||
int8 and int16 datatypes are now supported. It is still generally more
|
||||
efficient to use int32 for intermediate computations, even if the in-memory
|
||||
format is int8 or int16.
|
||||
|
||||
There are now standard library routines to convert to and from 'half'-format
|
||||
floating-point values (half_to_float() and float_to_half()).
|
||||
|
||||
There is a new example with an implementation of Perlin's Noise function
|
||||
(examples/noise). It shows a speedup of approximately 4.2x versus a C
|
||||
implementation on OSX and a 2.9x speedup versus C on Windows.
|
||||
|
||||
=== v1.0.4 === (18 July 2011)
|
||||
|
||||
enums are now supported in ispc; see the section on enumeration types in
|
||||
the documentation (http://ispc.github.com/ispc.html#enumeration-types) for
|
||||
more information.
|
||||
|
||||
bools are converted to integers with zero extension, not sign extension as
|
||||
before (i.e. a 'true' bool converts to the value one, not 'all bits on'.)
|
||||
For cases where sign extension is still desired, there is a
|
||||
sign_extend(bool) function in the standard library.
|
||||
|
||||
Support for 64-bit types in the standard library is much more complete than
|
||||
before.
|
||||
|
||||
64-bit integer constants are now supported by the parser.
|
||||
|
||||
Storage for parameters to tasks is now allocated dynamically on Windows,
|
||||
rather than on the stack; with this fix, all tests now run correctly on
|
||||
Windows.
|
||||
|
||||
There is now support for atomic swap and compare/exchange with float and
|
||||
double types.
|
||||
|
||||
A number of additional small bugs have been fixed and a number of cases
|
||||
where the compiler would crash given a malformed program have been fixed.
|
||||
|
||||
=== v1.0.3 === (4 July 2011)
|
||||
|
||||
ispc now has a built-in pre-processor (from LLVM's clang compiler).
|
||||
(Thanks to Pete Couperus for this patch!) It is therefore no longer
|
||||
necessary to use cl.exe for preprocessing on Windows; the MSVC project
|
||||
files for the examples have been updated accordingly.
|
||||
|
||||
There is another variant of the shuffle() function in the standard
|
||||
library: "<type> shuffle(<type> v0, <type> v1, int permute)", where the
|
||||
permutation vector indexes over the concatenation of the two vectors
|
||||
(e.g. the value 0 corresponds to the first element of v0, the value
|
||||
2*programCount-1 corresponds to the last element of v1, etc.)
|
||||
|
||||
ispc now supports the usual range of atomic operations (add, subtract, min,
|
||||
max, and, or, and xor) as well as atomic swap and atomic compare and
|
||||
exchange. There is also a facility for inserting memory fences. See the
|
||||
"Atomic Operations and Memory Fences" section of the user's guide
|
||||
(http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences) for
|
||||
more information.
|
||||
|
||||
There are now both 'signed' and 'unsigned' variants of the standard library
|
||||
functions like packed_load_active() that take references to arrays of
|
||||
signed int32s and unsigned int32s respectively. (The
|
||||
{load_from,store_to}_{int8,int16}() functions have similarly been augmented
|
||||
to have both 'signed' and 'unsigned' variants.)
|
||||
|
||||
In initializer expressions with variable declarations, it is no longer
|
||||
legal to initialize arrays and structs with single scalar values that then
|
||||
initialize their members; they now must be initialized with initializer
|
||||
lists in braces (or initialized after the declaration with a loop over
|
||||
array elements, etc.)
|
||||
|
||||
=== v1.0.2 === (1 July 2011)
|
||||
|
||||
Floating-point hexadecimal constants are now parsed correctly on Windows
|
||||
(fixes issue #16).
|
||||
|
||||
SSE2 is now the default target if --cpu=atom is given in the command line
|
||||
arguments and another target isn't explicitly specified.
|
||||
|
||||
The standard library now provides broadcast(), rotate(), and shuffle()
|
||||
routines for efficient communication between program instances.
|
||||
|
||||
The MSVC solution files to build the examples on Windows now use
|
||||
/fpmath:fast when building.
|
||||
|
||||
=== v1.0.1 === (24 June 2011)
|
||||
|
||||
ispc no longer requires that pointers to memory that are passed in to ispc
|
||||
have alignment equal to the target's vector width; now alignment just has to
|
||||
be the regular element alignment (e.g. 4 bytes for floats, etc.) This
|
||||
change also fixed a number of cases where it previously incorrectly
|
||||
generated aligned load/store instructions in cases where the address wasn't
|
||||
actually aligned (even if the base address passed into ispc code was).
|
||||
|
||||
=== v1.0 === (21 June 2011)
|
||||
|
||||
Initial Release
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
rst2html ispc.txt > ispc.html
|
||||
rst2html.py ispc.txt > ispc.html
|
||||
|
||||
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
|
||||
#pdflatex ispc.tex
|
||||
|
||||
453 docs/ispc.txt
@@ -50,6 +50,7 @@ Contents:
|
||||
|
||||
+ `Lexical Structure`_
|
||||
+ `Basic Types and Type Qualifiers`_
|
||||
+ `Enumeration Types`_
|
||||
+ `Short Vector Types`_
|
||||
+ `Struct and Array Types`_
|
||||
+ `Declarations and Initializers`_
|
||||
@@ -74,7 +75,10 @@ Contents:
|
||||
|
||||
+ `Math Functions`_
|
||||
+ `Output Functions`_
|
||||
+ `Cross-Lane Operations`_
|
||||
+ `Cross-Program Instance Operations`_
|
||||
+ `Packed Load and Store Operations`_
|
||||
+ `Conversions To and From Half-Precision Floats`_
|
||||
+ `Atomic Operations and Memory Fences`_
|
||||
+ `Low-Level Bits`_
|
||||
|
||||
* `Interoperability with the Application`_
|
||||
@@ -89,6 +93,7 @@ Contents:
|
||||
+ `Understanding How to Interoperate With the Application's Data`_
|
||||
+ `Communicating Between SPMD Program Instances`_
|
||||
+ `Gather and Scatter`_
|
||||
+ `8 and 16-bit Integer Types`_
|
||||
+ `Low-level Vector Tricks`_
|
||||
+ `Debugging`_
|
||||
+ `The "Fast math" Option`_
|
||||
@@ -103,27 +108,8 @@ Contents:
|
||||
Recent Changes to ISPC
|
||||
======================
|
||||
|
||||
This section summarizes recent changes and bugfixes.
|
||||
|
||||
* 17 May: Fixed a number of bugs related to error handling in Windows*. In
|
||||
particular, if you use the ``/E`` command line flag to ``cl.exe`` (rather
|
||||
than ``/EP``) when using it as a preprocessor, then ``ispc`` will
|
||||
correctly report the source file position with warnings and errors.
|
||||
|
||||
* 15 May: Improved error messages and warnings in many cases. For example,
|
||||
the column number is reported along with the line number and
|
||||
the source line with the error is printed as part of the message.
|
||||
|
||||
* 8 May: ``ispc``'s typechecker has been substantially improved in how it
|
||||
handles ``const``-qualified types. Some programs that previously
|
||||
compiled may now fail with errors related to ``const``. For example,
|
||||
``ispc`` issues an error message if you try to assign a member of a const
|
||||
structure.
|
||||
|
||||
* 2 May: "uniform" short-vector types are now stored across the lanes of
|
||||
the SIMD registers. This enables you to also write classic 'explicit
|
||||
vector' computation in ``ispc`` as well. This change does change how
|
||||
these types are laid out in memory; see `Data Layout`_ for more details.)
|
||||
See the file ``ReleaseNotes.txt`` in the ``ispc`` distribution for a list
|
||||
of recent changes to the compiler.
|
||||
|
||||
Getting Started with ISPC
|
||||
=========================
|
||||
@@ -136,7 +122,7 @@ Linux\* and Mac OS\* available for download. Alternatively, you can
|
||||
download the source code from that page and build it yourself; see the
|
||||
`ispc wiki`_ for instructions about building ``ispc`` from source.
|
||||
|
||||
.. _ispc downloads web page:downloads.html
|
||||
.. _ispc downloads web page: downloads.html
|
||||
.. _ispc wiki: http://github.com/ispc/ispc/wiki
|
||||
|
||||
Once you have an executable for your system, copy it into a directory
|
||||
@@ -281,19 +267,9 @@ with application code, enter the following command
|
||||
|
||||
ispc foo.ispc -o foo.o
|
||||
|
||||
On Linux\* and Mac OS\*, ``ispc`` automatically runs the C preprocessor on
|
||||
your input program; under Windows\*, this must be done manually. With
|
||||
Microsoft Visual C++ 2010\*, the following custom build step for
|
||||
``ispc`` source files takes care of this job:
|
||||
|
||||
::
|
||||
|
||||
cl /E /TP %(Filename).ispc | ispc - -o %(Filename).obj -h %(Filename).h
|
||||
|
||||
The ``cl`` call runs the C preprocessor on the ``ispc`` file; the result is
|
||||
piped to ``ispc`` to generate an object file and a header. As an example,
|
||||
see the file ``simple.vcxproj`` in the ``examples/simple`` directory of the
|
||||
``ispc`` distribution.
|
||||
``ispc`` automatically runs the C preprocessor on your input program before
|
||||
compiling it. (This functionality can be disabled with the ``--nocpp``
|
||||
command-line argument.)
|
||||
|
||||
Command-line Options
|
||||
--------------------
|
||||
@@ -340,7 +316,7 @@ before it's compiled. On Windows®, pre-processor definitions should be
|
||||
provided to the ``cl`` call.
|
||||
|
||||
By default, the compiler generates x86-64 Intel® SSE4 code. To generate
|
||||
32-bit code, you can use the the ``--arch=x86`` command-line flag. To
|
||||
32-bit code, you can use the ``--arch=x86`` command-line flag. To
|
||||
select Intel® SSE2, use ``--target=sse2``.
|
||||
|
||||
``ispc`` supports an alternative method for generating Intel® SSE4 code,
|
||||
@@ -453,7 +429,8 @@ The following identifiers are reserved as language keywords: ``bool``,
|
||||
``char``, ``cif``, ``cwhile``, ``const``, ``continue``, ``creturn``,
|
||||
``default``, ``do``, ``double``, ``else``, ``enum``, ``export``,
|
||||
``extern``, ``false``, ``float``, ``for``, ``goto``, ``if``, ``inline``, ``int``,
|
||||
``int32``, ``int64``, ``launch``, ``print``, ``reference``, ``return``,
|
||||
``int8``, ``int16``, ``int32``, ``int64``, ``launch``, ``print``,
|
||||
``reference``, ``return``,
|
||||
``signed``, ``sizeof``, ``soa``, ``static``, ``struct``, ``switch``,
|
||||
``sync``, ``task``, ``true``, ``typedef``, ``uniform``, ``union``,
|
||||
``unsigned``, ``varying``, ``void``, ``volatile``, ``while``.
|
||||
@@ -507,6 +484,10 @@ types.
|
||||
* ``void``: "empty" type representing no value.
|
||||
* ``bool``: boolean value; may be assigned ``true``, ``false``, or the
|
||||
value of a boolean expression.
|
||||
* ``int8``: 8-bit signed integer.
|
||||
* ``unsigned int8``: 8-bit unsigned integer.
|
||||
* ``int16``: 16-bit signed integer.
|
||||
* ``unsigned int16``: 16-bit unsigned integer.
|
||||
* ``int``: 32-bit signed integer; may also be specified as ``int32``.
|
||||
* ``unsigned int``: 32-bit unsigned integer; may also be specified as
|
||||
``unsigned int32``.
|
||||
@@ -523,7 +504,8 @@ general" of the two types, with the following precedence:
|
||||
|
||||
::
|
||||
|
||||
double > uint64 > int64 > float > uint32 > int32 > bool
|
||||
double > uint64 > int64 > float > uint32 > int32 >
|
||||
uint16 > int16 > uint8 > int8 > bool
|
||||
|
||||
In other words, adding an ``int64`` to a ``double`` causes the ``int64`` to
|
||||
be converted to a ``double``, the addition to be performed, and a
|
||||
@@ -536,11 +518,9 @@ is provided in parenthesis around the expression:
|
||||
double foo = 1. / 3.;
|
||||
int bar = (float)bar + (float)bar; // 32-bit float addition
|
||||
|
||||
Note: if a ``bool`` is converted to an integer numeric type (``int``,
|
||||
``int64``, etc.), then the conversion is done with sign extension, not zero
|
||||
extension. Thus, the resulting value has all bits set if the ``bool`` is
|
||||
``true``; for example, ``0xffffffff`` for ``int32``. This differs from C
|
||||
and C++, where a ``true`` bool is converted to the integer value one.
|
||||
If a ``bool`` is converted to an integer numeric type (``int``, ``int64``,
|
||||
etc.), then the result is the value one if the ``bool`` has the value
``true`` and the value zero otherwise.
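As a brief illustration (this sketch is not part of the original text; it
assumes the ``sign_extend()`` standard library function mentioned in the
release notes):

::

    float x = ...;
    bool b = (x > 0.);
    int one_or_zero = (int)b;          // 1 where b is true, 0 otherwise
    int all_or_none = sign_extend(b);  // 0xffffffff where b is true, 0 otherwise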
|
||||
|
||||
Variables can be declared with the ``const`` qualifier, which prohibits
|
||||
their modification.
|
||||
@@ -579,6 +559,51 @@ results or modify existing variables.
|
||||
``ispc`` doesn't currently support pointer types.
|
||||
|
||||
|
||||
Enumeration Types
|
||||
-----------------
|
||||
|
||||
It is possible to define user-defined enumeration types in ``ispc`` with
|
||||
the ``enum`` keyword, which is followed by an optional enumeration type name
|
||||
and then a brace-delimited list of enumerators with optional values:
|
||||
|
||||
::
|
||||
|
||||
enum Color { RED, GREEN, BLUE };
|
||||
enum Flags {
|
||||
UNINITIALIZED = 0,
|
||||
INITIALIZED = 2,
|
||||
CACHED = 4
|
||||
};
|
||||
|
||||
Each ``enum`` declaration defines a new type; an attempt to implicitly
|
||||
convert between enumerations of different types gives a compile-time error,
|
||||
but enumerations of different types can be explicitly cast to one another.
|
||||
|
||||
::
|
||||
|
||||
Color c = (Color)CACHED;
|
||||
|
||||
Enumerators are implicitly converted to integer types, however, so they can
|
||||
be directly passed to routines that take integer parameters and can be used
|
||||
in expressions including integers, for example. However, the integer
|
||||
result of such an expression must be explicitly cast back to the enumeration
type if it is to be assigned to a variable of the enumeration type.
|
||||
|
||||
::
|
||||
|
||||
Color c = RED;
|
||||
int nextColor = c+1;
|
||||
c = (Color)nextColor;
|
||||
|
||||
In this particular case, the explicit cast could be avoided using an
|
||||
increment operator.
|
||||
|
||||
::
|
||||
|
||||
Color c = RED;
|
||||
++c; // c == GREEN now
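As a further illustrative sketch (not from the original text), reusing the
``Flags`` enumeration defined above, enumerators can participate directly
in integer expressions:

::

    uniform int flags = INITIALIZED | CACHED;  // enumerators used as integers
    if ((flags & CACHED) != 0) {
        // the CACHED bit is set
    }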
|
||||
|
||||
|
||||
Short Vector Types
|
||||
------------------
|
||||
|
||||
@@ -648,6 +673,15 @@ expect, though the two vector types must have the same length:
|
||||
int<4> bat = foo; // ERROR: different vector lengths
|
||||
float<4> bing = foo; // ERROR: different vector lengths
|
||||
|
||||
For convenience, short vectors can be initialized with a list of individual
|
||||
element values:
|
||||
|
||||
::
|
||||
|
||||
float x = ..., y = ..., z = ...;
|
||||
float<3> pos = { x, y, z };
|
||||
|
||||
|
||||
There are two mechanisms to access the individual elements of these short
|
||||
vector data types. The first is with the array indexing operator:
|
||||
|
||||
@@ -676,25 +710,24 @@ using the array indexing operator with an index that is greater than the
|
||||
vector size, accessing an element that is beyond the vector's size is
|
||||
undefined behavior and may cause your program to crash.
|
||||
|
||||
Note: ``ispc`` doesn't support the "swizzling" operations that languages
|
||||
like HLSL do. Only a single element of the vector can be accessed at a
|
||||
time with these member operators.
|
||||
It is also possible to construct new short vectors from other short vector
|
||||
values using this syntax, extended for "swizzling". For example,
|
||||
|
||||
::
|
||||
|
||||
float<3> foo = ...;
|
||||
float<2> bar = foo.xy; // ERROR
|
||||
foo.xz = ...; // ERROR
|
||||
func(foo.xyx); // ERROR
|
||||
float<3> position = ...;
|
||||
float<3> new_pos = position.zyx; // reverse order of components
|
||||
float<2> pos_2d = position.xy;
|
||||
|
||||
For convenience, short vectors can be initialized with a list of individual
|
||||
element values:
|
||||
Though a single element can be assigned to, as in the examples above, it is
|
||||
not currently possible to use swizzles on the left-hand side of assignment
|
||||
expressions:
|
||||
|
||||
::
|
||||
|
||||
float x = ..., y = ..., z = ...;
|
||||
float<3> pos = { x, y, z };
|
||||
|
||||
int8<2> foo = ...;
|
||||
int8<2> bar = ...;
|
||||
foo.yz = bar; // Error: can't assign to left-hand side of expression
|
||||
|
||||
Struct and Array Types
|
||||
----------------------
|
||||
@@ -765,22 +798,18 @@ Variables can also be declared in ``for`` statement initializers:
|
||||
|
||||
for (int i = 0; ...)
|
||||
|
||||
Arrays can be initialized with either a scalar value or with individual
|
||||
element values in braces:
|
||||
Arrays can be initialized with individual element values in braces:
|
||||
|
||||
::
|
||||
|
||||
int foo[10] = x; // all ten elements take the value of x
|
||||
int bar[2][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 } };
|
||||
|
||||
Structures can also be initialized both with scalar values or with element
|
||||
values in braces:
|
||||
Structures, similarly, can only be initialized with element values in braces:
|
||||
|
||||
::
|
||||
|
||||
struct Color { float r, g, b; };
|
||||
....
|
||||
Color c = 1; // all are one
|
||||
Color d = { 0.5, .75, 1.0 }; // r = 0.5, ...
|
||||
|
||||
|
||||
@@ -877,7 +906,6 @@ C Constructs not in ISPC
|
||||
|
||||
The following C features are not available in ``ispc``.
|
||||
|
||||
* ``enum`` s
|
||||
* Pointers and function pointers
|
||||
* ``char`` and ``short`` types
|
||||
* ``switch`` statements
|
||||
@@ -1246,7 +1274,7 @@ section.)
|
||||
For ``if`` statements where the different running SPMD program instances
|
||||
don't have coherent values for the boolean ``if`` test, using ``cif``
|
||||
introduces some additional overhead from the ``all`` and ``any`` tests as
|
||||
well as the corresponding branches. For cases where the the program
|
||||
well as the corresponding branches. For cases where the program
|
||||
instances often do compute the same boolean value, this overhead is
|
||||
worthwhile. If the control flow is in fact usually incoherent, this
|
||||
overhead only costs performance.
|
||||
@@ -1406,13 +1434,25 @@ parallel execution.
|
||||
|
||||
If you use the task launch feature in ``ispc``, you must provide C/C++
|
||||
implementations of two functions and link them into your final executable
|
||||
file:
|
||||
file. Although these functions may be implemented in either language, they
|
||||
must have "C" linkage (i.e. their prototypes must be declared inside an
|
||||
``extern "C"`` block if they are defined in C++.)
|
||||
|
||||
::
|
||||
|
||||
void ISPCLaunch(void *funcptr, void *data);
|
||||
void ISPCSync();
|
||||
|
||||
On Windows, two additional functions must be provided to dynamically
|
||||
allocate and free memory to store the arguments passed to tasks. (On OSX
|
||||
and Linux, the stack provides memory for task arguments; on Windows, the
|
||||
stack is generally not large enough to do this for large numbers of tasks.)
|
||||
|
||||
::
|
||||
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
||||
void ISPCFree(void *ptr);
|
||||
|
||||
These are called by the task launch code generated by the ``ispc``
|
||||
compiler; the first is called to launch a task and the second is
called to wait for all launched tasks to finish. (Factoring them out in this way
|
||||
@@ -1659,14 +1699,14 @@ values for the inactive program instances aren't printed. (In other cases,
|
||||
they may have garbage values or be otherwise undefined.)
|
||||
|
||||
|
||||
Cross-Lane Operations
|
||||
---------------------
|
||||
Cross-Program Instance Operations
|
||||
---------------------------------
|
||||
|
||||
Usually, ``ispc`` code expresses independent computation on separate data
|
||||
elements. There are, however, a number of cases where it's useful for the
|
||||
program instances to be able to cooperate in computing results. The
|
||||
cross-lane operations described in this section provide primitives for
|
||||
communication between the running program instances.
|
||||
Usually, ``ispc`` code expresses independent programs performing
|
||||
computation on separate data elements. There are, however, a number of
|
||||
cases where it's useful for the program instances to be able to cooperate
|
||||
in computing results. The cross-lane operations described in this section
|
||||
provide primitives for communication between the running program instances.
|
||||
|
||||
A few routines evaluate conditions across the running program
|
||||
instances. For example, ``any()`` returns ``true`` if the given value
|
||||
@@ -1678,6 +1718,70 @@ and ``all()`` returns ``true`` if it true for all of them.
|
||||
uniform bool any(bool v)
|
||||
uniform bool all(bool v)
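For illustration, a short usage sketch that is not part of the original
documentation:

::

    float x = ...;
    if (all(x >= 0.)) {
        // every running program instance has a non-negative x, so the
        // whole gang can safely take the square root
        x = sqrt(x);
    }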
|
||||
|
||||
To broadcast a value from one program instance to all of the others, a
|
||||
``broadcast()`` function is available. It broadcasts the value of the
|
||||
``value`` parameter for the program instance given by ``index`` to all of
|
||||
the running program instances.
|
||||
|
||||
::
|
||||
|
||||
int8 broadcast(int8 value, uniform int index)
|
||||
int16 broadcast(int16 value, uniform int index)
|
||||
int32 broadcast(int32 value, uniform int index)
|
||||
int64 broadcast(int64 value, uniform int index)
|
||||
float broadcast(float value, uniform int index)
|
||||
double broadcast(double value, uniform int index)
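A small sketch of typical usage (not from the original text):

::

    float t = ...;               // each program instance computes its own t
    float t0 = broadcast(t, 0);  // all instances now hold instance 0's value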
|
||||
|
||||
The ``rotate()`` function lets each program instance read the value of the
given ``value`` parameter held by the program instance ``offset`` steps
away. For example, on an 8-wide target, if ``value`` holds (1, 2, 3, 4, 5,
6, 7, 8) across the running program instances, then ``rotate(value, -1)``
causes the first program instance to get the value 8, the second
program instance to get the value 1, the third 2, and so forth. The
|
||||
provided offset value can be positive or negative, and may be greater than
|
||||
``programCount`` (it is masked to ensure valid offsets).
|
||||
|
||||
::
|
||||
|
||||
int8 rotate(int8 value, uniform int offset)
|
||||
int16 rotate(int16 value, uniform int offset)
|
||||
int32 rotate(int32 value, uniform int offset)
|
||||
int64 rotate(int64 value, uniform int offset)
|
||||
float rotate(float value, uniform int offset)
|
||||
double rotate(double value, uniform int offset)
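An illustrative sketch (not part of the original text):

::

    float v = ...;
    // each program instance reads the value held by the instance one step
    // to its left; the first instance wraps around to the last one
    float left = rotate(v, -1);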
|
||||
|
||||
|
||||
Finally, the ``shuffle()`` functions allow two variants of fully general
|
||||
shuffling of values among the program instances. For the first version,
|
||||
each program instance's value of permutation gives the program instance
|
||||
from which to get the value of ``value``. The provided values for
|
||||
``permutation`` must all be between 0 and ``programCount-1``.
|
||||
|
||||
::
|
||||
|
||||
int8 shuffle(int8 value, int permutation)
|
||||
int16 shuffle(int16 value, int permutation)
|
||||
int32 shuffle(int32 value, int permutation)
|
||||
int64 shuffle(int64 value, int permutation)
|
||||
float shuffle(float value, int permutation)
|
||||
double shuffle(double value, int permutation)
|
||||
|
||||
|
||||
The second variant of ``shuffle()`` permutes over the extended vector that
|
||||
is the concatenation of the two provided values. In other words, a value
|
||||
of 0 in an element of ``permutation`` corresponds to the first element of
|
||||
``value0``, the value ``2*programCount-1`` corresponds to the last element
|
||||
of ``value1``, etc.)
|
||||
|
||||
::
|
||||
|
||||
int8 shuffle(int8 value0, int8 value1, int permutation)
|
||||
int16 shuffle(int16 value0, int16 value1, int permutation)
|
||||
int32 shuffle(int32 value0, int32 value1, int permutation)
|
||||
int64 shuffle(int64 value0, int64 value1, int permutation)
|
||||
float shuffle(float value0, float value1, int permutation)
|
||||
double shuffle(double value0, double value1, int permutation)
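A brief sketch (not from the original text) that uses the built-in
``programIndex`` variable to shift values by one position across the
concatenation of two vectors:

::

    float v0 = ..., v1 = ...;
    // element i receives element i+1 of the concatenation (v0, v1)
    float shifted = shuffle(v0, v1, programIndex + 1);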
|
||||
|
||||
The various variants of ``popcnt()`` return the population count--the
|
||||
number of bits set in the given value.
|
||||
|
||||
@@ -1719,25 +1823,36 @@ given value across all of the currently-executing vector lanes.
|
||||
uniform unsigned int reduce_max(unsigned int a, unsigned int b)
|
||||
|
||||
|
||||
Finally, there are routines for writing out and reading in values from
|
||||
linear memory locations for the active program instances.
|
||||
``packed_load_active()`` loads consecutive values from the given array,
|
||||
starting at ``a[offset]``, loading one value for each currently-executing
|
||||
program instance and storing it into that program instance's ``val``
|
||||
variable. It returns the total number of values loaded. Similarly,
|
||||
``packed_store_active()`` stores the ``val`` values for each program
|
||||
instances that executed the ``packed_store_active()`` call, storing the
|
||||
results into the given array starting at the given offset. It returns the
|
||||
total number of values stored.
|
||||
|
||||
Packed Load and Store Operations
--------------------------------

The standard library also offers routines for writing out and reading in
values from linear memory locations for the active program instances.  The
``packed_load_active()`` functions load consecutive values from the given
array, starting at ``a[offset]``, loading one value for each
currently-executing program instance and storing it into that program
instance's ``val`` variable.  They return the total number of values
loaded.  Similarly, the ``packed_store_active()`` functions store the
``val`` values for each program instance that executed the
``packed_store_active()`` call, storing the results into the given array
starting at the given offset.  They return the total number of values
stored.

::

    uniform unsigned int packed_load_active(uniform int a[],
                                            uniform int offset,
                                            reference int val)
    uniform unsigned int packed_store_active(uniform int a[],
                                             uniform int offset,
                                             int val)
    uniform int packed_load_active(uniform int a[],
                                   uniform int offset,
                                   reference int val)
    uniform int packed_load_active(uniform unsigned int a[],
                                   uniform int offset,
                                   reference unsigned int val)
    uniform int packed_store_active(uniform int a[],
                                    uniform int offset,
                                    int val)
    uniform int packed_store_active(uniform unsigned int a[],
                                    uniform int offset,
                                    unsigned int val)

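As an illustrative sketch of the usage described above (the function and
array names are hypothetical), ``packed_store_active()`` can be used to
compact the indices of elements that pass a test into a contiguous output
array:

::

    // Illustrative sketch: append the index of every non-negative element
    // of "a" to the "indices" array, returning how many were written.
    // Each call to packed_store_active() stores one value per program
    // instance that is active inside the "if".
    uniform int collect_nonnegative(uniform float a[], uniform int count,
                                    uniform int indices[]) {
        uniform int numStored = 0;
        for (uniform int i = 0; i < count; i += programCount) {
            int index = i + programIndex;
            if (index < count && a[index] >= 0.)
                numStored += packed_store_active(indices, numStored, index);
        }
        return numStored;
    }
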
As an example of how these functions can be used, the following code shows
@@ -1770,41 +1885,123 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v``

::

    uniform int8 extract(int8 x, uniform int i)
    uniform int16 extract(int16 x, uniform int i)
    uniform int32 extract(int32 x, uniform int i)
    uniform int64 extract(int64 x, uniform int i)
    uniform float extract(float x, uniform int i)
    uniform int extract(int x, uniform int i)

::

    int8 insert(int8 x, uniform int i, uniform int8 v)
    int16 insert(int16 x, uniform int i, uniform int16 v)
    int32 insert(int32 x, uniform int i, uniform int32 v)
    int64 insert(int64 x, uniform int i, uniform int64 v)
    float insert(float x, uniform int i, uniform float v)
    int insert(int x, uniform int i, uniform int v)

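For illustration (the function name is hypothetical), ``extract()`` can be
used to walk over the individual program instances' values in uniform
code, for example to accumulate them serially:

::

    // Illustrative sketch: serially sum the per-instance values of a
    // varying float into a single uniform result using extract().
    uniform float serial_sum(float v) {
        uniform float sum = 0;
        for (uniform int i = 0; i < programCount; ++i)
            sum += extract(v, i);
        return sum;
    }
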
Conversions To and From Half-Precision Floats
---------------------------------------------

There are functions to convert to and from the IEEE 16-bit floating-point
format.  Note that there is no ``half`` data-type, and it isn't possible
to do floating-point math directly with ``half`` types in ``ispc``; these
functions facilitate converting to and from half-format data in memory.

To use them, half-format data should be loaded into an ``int16`` and the
``half_to_float()`` function used to convert it to a 32-bit floating-point
value.  To store a value to memory in half format, the ``float_to_half()``
function returns the 16 bits that are the closest match to the given
``float``, in half format.

::

    float half_to_float(unsigned int16 h)
    uniform float half_to_float(uniform unsigned int16 h)
    int16 float_to_half(float f)
    uniform int16 float_to_half(uniform float f)

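A minimal sketch of the conversion flow described above (the function name
is illustrative):

::

    // Illustrative sketch: "h" holds an IEEE half-precision value as raw
    // 16 bits; convert it to float, scale it, and convert the result back
    // to half format for storage.
    unsigned int16 scale_half(unsigned int16 h, uniform float scale) {
        float v = half_to_float(h);
        return float_to_half(v * scale);
    }
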
Atomic Operations and Memory Fences
-----------------------------------

The usual range of atomic memory operations is provided in ``ispc``.  As an
example, consider the 32-bit integer atomic add routine:

::

    int32 atomic_add_global(reference uniform int32 val, int32 delta)

The semantics are the expected ones for an atomic add function: the value
"val" has the value "delta" added to it atomically, and the old value of
"val" is returned from the function.  (Thus, if multiple processors
simultaneously issue atomic adds to the same memory location, the adds will
be serialized by the hardware so that the correct result is computed in the
end.)

One thing to note is that the value being added to here is a
``uniform`` integer, while the increment amount and the return value are
``varying``.  In other words, the semantics are that each running program
instance individually issues the atomic operation with its own ``delta``
value and gets the previous value of ``val`` back in return.  The atomics
for the running program instances may be issued in arbitrary order; it's
not guaranteed that they will be issued in ``programIndex`` order, for
example.

Here are the declarations of the ``int32`` variants of these functions.
There are also ``int64`` equivalents as well as variants that take
``unsigned`` ``int32`` and ``int64`` values.  (The ``atomic_swap_global()``
function can be used with ``float`` and ``double`` types as well.)

::

    int32 atomic_add_global(reference uniform int32 val, int32 value)
    int32 atomic_subtract_global(reference uniform int32 val, int32 value)
    int32 atomic_min_global(reference uniform int32 val, int32 value)
    int32 atomic_max_global(reference uniform int32 val, int32 value)
    int32 atomic_and_global(reference uniform int32 val, int32 value)
    int32 atomic_or_global(reference uniform int32 val, int32 value)
    int32 atomic_xor_global(reference uniform int32 val, int32 value)
    int32 atomic_swap_global(reference uniform int32 val, int32 newval)

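As an illustrative sketch (the function and parameter names are
hypothetical), an atomic add on a shared counter can be used to hand out
unique output slots to the running program instances:

::

    // Illustrative sketch: each active program instance atomically
    // reserves one slot in the output array by adding 1 to the shared
    // counter, then writes its value into the slot it was given.  The
    // return value of atomic_add_global() is the pre-increment counter
    // value seen by each instance, so the slots are unique.
    void append_value(reference uniform int32 counter, uniform float out[],
                      float v) {
        int32 slot = atomic_add_global(counter, 1);
        out[slot] = v;
    }
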
There is also an atomic "compare and exchange" function; it atomically
compares the value in "val" to "compare"--if they match, it assigns
"newval" to "val".  In either case, the old value of "val" is returned.
(As with the other atomic operations, there are also ``unsigned`` and
64-bit variants of this function.  Furthermore, there are ``float`` and
``double`` variants as well.)

::

    int32 atomic_compare_exchange_global(reference uniform int32 val,
                                         int32 compare, int32 newval)

``ispc`` also has a standard library routine that inserts a memory barrier
into the code; it ensures that all memory reads and writes prior to the
barrier complete before any reads or writes after the barrier are issued.
See the `Linux kernel documentation on memory barriers`_ for an excellent
writeup on the need for and the use of memory barriers in multi-threaded
code.

.. _Linux kernel documentation on memory barriers: http://www.kernel.org/doc/Documentation/memory-barriers.txt

::

    void memory_barrier();

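A sketch of a common use of compare-and-exchange (the function name is
illustrative, and the ``float`` variant mentioned above is assumed):
updating a shared maximum value, for which no dedicated atomic is listed:

::

    // Illustrative sketch: fold each program instance's value into a
    // shared running maximum.  Each instance retries until either its
    // exchange succeeds or the shared value is already at least as large.
    void update_max(reference uniform float sharedMax, float v) {
        float old = sharedMax;
        while (v > old) {
            float prev = atomic_compare_exchange_global(sharedMax, old, v);
            if (prev == old)
                break;      // our value was stored
            old = prev;     // another update intervened; try again
        }
    }
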
Low-Level Bits
--------------

``ispc`` provides a number of bit/memory-level utility routines in its
standard library as well.  It has routines that load from and store
to 8-bit and 16-bit integer values stored in memory, converting to and from
32-bit integers for use in computation in ``ispc`` code.  (These functions
and this conversion step are necessary because ``ispc`` doesn't have native
8-bit or 16-bit types in the language.)
Sometimes it's useful to convert a ``bool`` value to an integer using sign
extension so that the integer's bits are all on if the ``bool`` has the
value ``true`` (rather than just having the value one).  The
``sign_extend()`` functions provide this functionality:

::

    unsigned int load_from_int8(uniform int a[],
                                uniform int offset)
    void store_to_int8(uniform int a[], uniform int offset,
                       unsigned int val)
    unsigned int load_from_int16(uniform int a[],
                                 uniform int offset)
    void store_to_int16(uniform int a[], uniform int offset,
                        unsigned int val)

There are two things to note in these functions.  First, note that these
functions take ``unsigned int`` arrays as parameters; you need
to cast the ``int8_t`` and ``int16_t`` pointers from the C/C++ side to
``unsigned int`` when passing them to ``ispc`` code.  Second, although the
arrays are passed as ``unsigned int``, in the array indexing calculation
with the ``offset`` parameter, they are treated as if they were ``int8`` or
``int16`` types.  (I.e., the offset is treated as being in terms of the
number of 8- or 16-bit elements.)

    int sign_extend(bool value)
    uniform int sign_extend(uniform bool value)

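A small sketch of the kind of bit trick that ``sign_extend()`` enables
(the function name here is illustrative): selecting between two integer
values without a branch by using the all-ones mask:

::

    // Illustrative sketch: branch-free select between two int32 values.
    // sign_extend(c) is all ones when c is true and zero when it is
    // false, so the bitwise expression below picks a or b accordingly.
    int32 select_int32(bool c, int32 a, int32 b) {
        int32 mask = sign_extend(c);
        return (a & mask) | (b & ~mask);
    }
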
The ``intbits()`` and ``floatbits()`` functions can be used to implement
low-level floating-point bit twiddling.  For example, ``intbits()`` returns

@@ -1840,7 +2037,6 @@ It, it clears the high order bit, to ensure that the given floating-point

value is positive.  This compiles down to a single ``andps`` instruction
when used with an Intel® SSE target, for example.

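As an illustration of this technique (a sketch, not the document's exact
code), the sign bit of a ``float`` can be cleared by round-tripping the
value through its integer bit representation:

::

    // Illustrative sketch: absolute value of a float, computed by
    // reinterpreting its bits as an integer with intbits(), clearing the
    // high (sign) bit, and converting back with floatbits().
    float float_abs(float a) {
        unsigned int bits = intbits(a);
        bits &= 0x7fffffff;
        return floatbits(bits);
    }
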
Interoperability with the Application
=====================================

@@ -2279,21 +2475,11 @@ elements to work with and then proceeds with the computation.

Communicating Between SPMD Program Instances
--------------------------------------------

The ``programIndex`` built-in variable (see `Mapping Data To Program
Instances`_) can be used to communicate between the set of executing
program instances.  Consider the following code, which shows all of the
program instances writing into unique locations in an array.

::

    float x = ...;
    uniform float allX[programCount];
    allX[programIndex] = x;

In this code, a program instance that reads ``allX[0]`` finds the value of
``x`` that was computed by the first of the running program instances, and
so forth.  Program instances can communicate with their neighbor instances
with indexing like ``allX[(programIndex+1)%programCount]``.
The ``broadcast()``, ``rotate()``, and ``shuffle()`` standard library
routines provide a variety of mechanisms for the running program instances
to communicate values to each other during execution.  See the section
`Cross-Program Instance Operations`_ for more information about their
operation.

Gather and Scatter

@@ -2351,6 +2537,15 @@ do a vector load. For example, given:

A regular vector load is done from the array, starting at offset ``2*x``.

8 and 16-bit Integer Types
--------------------------

The code generated for 8 and 16-bit integer types is generally not as
efficient as the code generated for 32-bit integer types.  It is usually
worthwhile to use 32-bit integer types for intermediate computations, even
if the final result will be stored in a smaller integer type.

Low-level Vector Tricks
-----------------------

@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
# This could be handy for archiving the generated documentation or
# if some version control system is used.

PROJECT_NUMBER = 1.0
PROJECT_NUMBER = 1.0.5

# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.
@@ -610,7 +610,7 @@ INPUT = builtins.h \
                         util.cpp \
                         parse.yy \
                         lex.ll \
                         stdlib-c.c
                         builtins-c.c

# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
@@ -57,6 +57,13 @@ Linux, a pthreads-based task system is used (tasks_pthreads.cpp). When
using tasks with ispc, no task system is mandated; the user is free to plug
in any task system they want, for ease of interoperating with existing task
systems.

Noise
=====

This example has an implementation of Ken Perlin's procedural "noise"
function, as described in his 2002 "Improving Noise" SIGGRAPH paper.


Options
=======

2
examples/aobench/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
ao
|
||||
*.ppm
|
||||
@@ -55,6 +55,7 @@
|
||||
using namespace ispc;
|
||||
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
|
||||
#define NSUBSAMPLES 2
|
||||
|
||||
@@ -103,6 +104,38 @@ savePPM(const char *fname, int w, int h)
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc != 4) {
|
||||
@@ -117,6 +150,8 @@ int main(int argc, char **argv)
|
||||
height = atoi (argv[3]);
|
||||
}
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
// Allocate space for output images
|
||||
img = new unsigned char[width * height * 3];
|
||||
fimg = new float[width * height * 3];
|
||||
|
||||
@@ -25,15 +25,15 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="ao.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
@@ -102,6 +102,8 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -115,6 +117,8 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -130,6 +134,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -147,6 +152,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -158,4 +164,4 @@
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
2
examples/aobench_instrumented/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
ao
|
||||
*.ppm
|
||||
@@ -56,6 +56,7 @@ using namespace ispc;
|
||||
|
||||
#include "instrument.h"
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
|
||||
#define NSUBSAMPLES 2
|
||||
|
||||
@@ -102,6 +103,38 @@ savePPM(const char *fname, int w, int h)
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc != 4) {
|
||||
@@ -116,6 +149,8 @@ int main(int argc, char **argv)
|
||||
height = atoi (argv[3]);
|
||||
}
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
// Allocate space for output images
|
||||
img = new unsigned char[width * height * 3];
|
||||
fimg = new float[width * height * 3];
|
||||
|
||||
@@ -25,15 +25,15 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="ao.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --instrument
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --instrument
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
|
||||
66
examples/cpuid.h
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef ISPC_CPUID_H
|
||||
#define ISPC_CPUID_H 1
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// Provides a __cpuid() function with same signature as below
|
||||
#include <intrin.h>
|
||||
#else
|
||||
static void __cpuid(int info[4], int infoType) {
|
||||
__asm__ __volatile__ ("cpuid"
|
||||
: "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
: "0" (infoType));
|
||||
}
|
||||
#endif
|
||||
|
||||
inline bool CPUSupportsSSE2() {
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
return (info[3] & (1 << 26)) != 0;
|
||||
}
|
||||
|
||||
inline bool CPUSupportsSSE4() {
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
return (info[2] & (1 << 19)) != 0;
|
||||
}
|
||||
|
||||
inline bool CPUSupportsAVX() {
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
return (info[2] & (1 << 28)) != 0;
|
||||
}
|
||||
|
||||
#endif // ISPC_CPUID_H
|
||||
@@ -15,6 +15,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot_tasks", "mandelb
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench_instrumented", "aobench_instrumented\aobench_instrumented.vcxproj", "{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Win32 = Debug|Win32
|
||||
@@ -79,6 +81,14 @@ Global
|
||||
{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.Build.0 = Release|Win32
|
||||
{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.ActiveCfg = Release|x64
|
||||
{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.Build.0 = Release|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.Build.0 = Debug|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.Build.0 = Release|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.ActiveCfg = Release|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "mandelbrot_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
@@ -66,6 +67,38 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
unsigned int width = 768;
|
||||
unsigned int height = 512;
|
||||
@@ -77,6 +110,8 @@ int main() {
|
||||
int maxIterations = 256;
|
||||
int *buf = new int[width*height];
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation; report the minimum
|
||||
// time of three runs.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -81,6 +81,8 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -94,6 +96,8 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -109,6 +113,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -126,6 +131,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -141,15 +147,15 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="mandelbrot.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
|
||||
2
examples/mandelbrot_tasks/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
mandelbrot
|
||||
*.ppm
|
||||
@@ -41,6 +41,7 @@
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "mandelbrot_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
@@ -66,6 +67,38 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
unsigned int width = 1536;
|
||||
unsigned int height = 1024;
|
||||
@@ -74,6 +107,8 @@ int main() {
|
||||
float y0 = -1;
|
||||
float y1 = 1;
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
extern void TasksInit();
|
||||
TasksInit();
|
||||
|
||||
|
||||
@@ -81,6 +81,8 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -94,6 +96,8 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -109,6 +113,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -126,6 +131,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -142,15 +148,15 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="mandelbrot.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
@@ -159,4 +165,4 @@
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
@@ -37,13 +37,17 @@
|
||||
#include <windows.h>
|
||||
#include <concrt.h>
|
||||
using namespace Concurrency;
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
void ISPCLaunch(void *f, void *data);
|
||||
void ISPCSync();
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
||||
void ISPCFree(void *ptr);
|
||||
}
|
||||
|
||||
typedef void (*TaskFuncType)(void *, int, int);
|
||||
@@ -61,12 +65,14 @@ static int taskOffset;
|
||||
static TaskInfo taskInfo[MAX_TASKS];
|
||||
static event *events[MAX_TASKS];
|
||||
static CRITICAL_SECTION criticalSection;
|
||||
static bool initialized = false;
|
||||
|
||||
void
|
||||
TasksInit() {
|
||||
InitializeCriticalSection(&criticalSection);
|
||||
for (int i = 0; i < MAX_TASKS; ++i)
|
||||
events[i] = new event;
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
|
||||
@@ -91,6 +97,11 @@ lRunTask(LPVOID param) {
|
||||
|
||||
void
|
||||
ISPCLaunch(void *func, void *data) {
|
||||
if (!initialized) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Get a TaskInfo struct for this task
|
||||
EnterCriticalSection(&criticalSection);
|
||||
TaskInfo *ti = &taskInfo[taskOffset++];
|
||||
@@ -105,6 +116,11 @@ ISPCLaunch(void *func, void *data) {
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
if (!initialized) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
event::wait_for_multiple(&events[0], taskOffset, true,
|
||||
COOPERATIVE_TIMEOUT_INFINITE);
|
||||
|
||||
@@ -113,3 +129,13 @@ void ISPCSync() {
|
||||
|
||||
taskOffset = 0;
|
||||
}
|
||||
|
||||
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment) {
|
||||
return _aligned_malloc(size, alignment);
|
||||
}
|
||||
|
||||
|
||||
void ISPCFree(void *ptr) {
|
||||
_aligned_free(ptr);
|
||||
}
|
||||
|
||||
@@ -35,7 +35,10 @@
|
||||
Dispatch. */
|
||||
|
||||
#include <dispatch/dispatch.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static bool initialized = false;
|
||||
static dispatch_queue_t gcdQueue;
|
||||
static dispatch_group_t gcdGroup;
|
||||
|
||||
@@ -55,6 +58,7 @@ void
|
||||
TasksInit() {
|
||||
gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
|
||||
gcdGroup = dispatch_group_create();
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
|
||||
@@ -77,6 +81,10 @@ lRunTask(void *ti) {
|
||||
|
||||
|
||||
void ISPCLaunch(void *func, void *data) {
|
||||
if (!initialized) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
TaskInfo *ti = new TaskInfo;
|
||||
ti->func = func;
|
||||
ti->data = data;
|
||||
@@ -85,6 +93,11 @@ void ISPCLaunch(void *func, void *data) {
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
if (!initialized) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Wait for all of the tasks in the group to complete before returning
|
||||
dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
|
||||
}
|
||||
|
||||
@@ -135,6 +135,11 @@ TasksInit() {
|
||||
|
||||
void
|
||||
ISPCLaunch(void *f, void *d) {
|
||||
if (threads == NULL) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Acquire mutex, add task
|
||||
//
|
||||
@@ -256,6 +261,11 @@ lTaskEntry(void *arg) {
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
if (threads == NULL) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int err;
|
||||
if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
|
||||
3
examples/noise/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
noise
|
||||
*.ppm
|
||||
objs
|
||||
26
examples/noise/Makefile
Normal file
@@ -0,0 +1,26 @@
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
|
||||
|
||||
default: noise
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ noise
|
||||
|
||||
noise: dirs objs/noise.o objs/noise_serial.o objs/noise_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/noise.o objs/noise_ispc.o objs/noise_serial.o -lm
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/noise.o: objs/noise_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
150
examples/noise/noise.cpp
Normal file
@@ -0,0 +1,150 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "noise_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
extern void noise_serial(float x0, float y0, float x1, float y1,
|
||||
int width, int height, float output[]);
|
||||
|
||||
/* Write a PPM image file with the image */
|
||||
static void
|
||||
writePPM(float *buf, int width, int height, const char *fn) {
|
||||
FILE *fp = fopen(fn, "wb");
|
||||
fprintf(fp, "P6\n");
|
||||
fprintf(fp, "%d %d\n", width, height);
|
||||
fprintf(fp, "255\n");
|
||||
for (int i = 0; i < width*height; ++i) {
|
||||
float v = buf[i] * 255.f;
|
||||
if (v < 0) v = 0;
|
||||
if (v > 255) v = 255;
|
||||
for (int j = 0; j < 3; ++j)
|
||||
fputc((char)v, fp);
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
unsigned int width = 768;
|
||||
unsigned int height = 768;
|
||||
float x0 = -10;
|
||||
float x1 = 10;
|
||||
float y0 = -10;
|
||||
float y1 = 10;
|
||||
|
||||
float *buf = new float[width*height];
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation; report the minimum
|
||||
// time of three runs.
|
||||
//
|
||||
double minISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
noise_ispc(x0, y0, x1, y1, width, height, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minISPC = std::min(minISPC, dt);
|
||||
}
|
||||
|
||||
printf("[noise ispc]:\t\t\t[%.3f] million cycles\n", minISPC);
|
||||
writePPM(buf, width, height, "noise-ispc.ppm");
|
||||
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
// minimum time.
|
||||
//
|
||||
double minSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
noise_serial(x0, y0, x1, y1, width, height, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minSerial = std::min(minSerial, dt);
|
||||
}
|
||||
|
||||
printf("[noise serial]:\t\t\t[%.3f] millon cycles\n", minSerial);
|
||||
writePPM(buf, width, height, "noise-serial.ppm");
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);
|
||||
|
||||
return 0;
|
||||
}
|
||||
164
examples/noise/noise.ispc
Normal file
@@ -0,0 +1,164 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#define NOISE_PERM_SIZE 256
|
||||
|
||||
static uniform int NoisePerm[2 * NOISE_PERM_SIZE] = {
|
||||
151, 160, 137, 91, 90, 15, 131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140,
|
||||
36, 103, 30, 69, 142, 8, 99, 37, 240, 21, 10, 23, 190, 6, 148, 247, 120,
|
||||
234, 75, 0, 26, 197, 62, 94, 252, 219, 203, 117, 35, 11, 32, 57, 177, 33,
|
||||
88, 237, 149, 56, 87, 174, 20, 125, 136, 171, 168, 68, 175, 74, 165, 71,
|
||||
134, 139, 48, 27, 166, 77, 146, 158, 231, 83, 111, 229, 122, 60, 211, 133,
|
||||
230, 220, 105, 92, 41, 55, 46, 245, 40, 244, 102, 143, 54, 65, 25, 63, 161,
|
||||
1, 216, 80, 73, 209, 76, 132, 187, 208, 89, 18, 169, 200, 196, 135, 130,
|
||||
116, 188, 159, 86, 164, 100, 109, 198, 173, 186, 3, 64, 52, 217, 226, 250,
|
||||
124, 123, 5, 202, 38, 147, 118, 126, 255, 82, 85, 212, 207, 206, 59, 227,
|
||||
47, 16, 58, 17, 182, 189, 28, 42, 223, 183, 170, 213, 119, 248, 152, 2, 44,
|
||||
154, 163, 70, 221, 153, 101, 155, 167, 43, 172, 9, 129, 22, 39, 253, 19,
|
||||
98, 108, 110, 79, 113, 224, 232, 178, 185, 112, 104, 218, 246, 97, 228, 251,
|
||||
34, 242, 193, 238, 210, 144, 12, 191, 179, 162, 241, 81, 51, 145, 235, 249,
|
||||
14, 239, 107, 49, 192, 214, 31, 181, 199, 106, 157, 184, 84, 204, 176, 115,
|
||||
121, 50, 45, 127, 4, 150, 254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72,
|
||||
243, 141, 128, 195, 78, 66, 215, 61, 156, 180, 151, 160, 137, 91, 90, 15,
|
||||
131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140, 36, 103, 30, 69, 142, 8, 99,
|
||||
37, 240, 21, 10, 23, 190, 6, 148, 247, 120, 234, 75, 0, 26, 197, 62, 94, 252,
|
||||
219, 203, 117, 35, 11, 32, 57, 177, 33, 88, 237, 149, 56, 87, 174, 20, 125,
|
||||
136, 171, 168, 68, 175, 74, 165, 71, 134, 139, 48, 27, 166, 77, 146, 158,
|
||||
231, 83, 111, 229, 122, 60, 211, 133, 230, 220, 105, 92, 41, 55, 46, 245,
|
||||
40, 244, 102, 143, 54, 65, 25, 63, 161, 1, 216, 80, 73, 209, 76, 132, 187,
|
||||
208, 89, 18, 169, 200, 196, 135, 130, 116, 188, 159, 86, 164, 100, 109,
|
||||
198, 173, 186, 3, 64, 52, 217, 226, 250, 124, 123, 5, 202, 38, 147, 118,
|
||||
126, 255, 82, 85, 212, 207, 206, 59, 227, 47, 16, 58, 17, 182, 189, 28, 42,
|
||||
223, 183, 170, 213, 119, 248, 152, 2, 44, 154, 163, 70, 221, 153, 101, 155,
|
||||
167, 43, 172, 9, 129, 22, 39, 253, 19, 98, 108, 110, 79, 113, 224, 232,
|
||||
178, 185, 112, 104, 218, 246, 97, 228, 251, 34, 242, 193, 238, 210, 144,
|
||||
12, 191, 179, 162, 241, 81, 51, 145, 235, 249, 14, 239, 107, 49, 192, 214,
|
||||
31, 181, 199, 106, 157, 184, 84, 204, 176, 115, 121, 50, 45, 127, 4, 150,
|
||||
254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72, 243, 141, 128, 195, 78,
|
||||
66, 215, 61, 156, 180
|
||||
};
|
||||
|
||||
|
||||
inline float SmoothStep(float low, float high, float value) {
|
||||
float v = clamp((value - low) / (high - low), 0.f, 1.f);
|
||||
return v * v * (-2.f * v + 3.f);
|
||||
}
|
||||
|
||||
|
||||
inline int Floor2Int(float val) {
|
||||
return (int)floor(val);
|
||||
}
|
||||
|
||||
|
||||
inline float Grad(int x, int y, int z, float dx, float dy, float dz) {
|
||||
int h = NoisePerm[NoisePerm[NoisePerm[x]+y]+z];
|
||||
h &= 15;
|
||||
float u = h<8 || h==12 || h==13 ? dx : dy;
|
||||
float v = h<4 || h==12 || h==13 ? dy : dz;
|
||||
return ((h&1) ? -u : u) + ((h&2) ? -v : v);
|
||||
}
|
||||
|
||||
|
||||
inline float NoiseWeight(float t) {
|
||||
float t3 = t*t*t;
|
||||
float t4 = t3*t;
|
||||
return 6.f*t4*t - 15.f*t4 + 10.f*t3;
|
||||
}
|
||||
|
||||
|
||||
inline float Lerp(float t, float low, float high) {
|
||||
return (1. - t) * low + t * high;
|
||||
}
|
||||
|
||||
|
||||
static float Noise(float x, float y, float z) {
|
||||
// Compute noise cell coordinates and offsets
|
||||
int ix = Floor2Int(x), iy = Floor2Int(y), iz = Floor2Int(z);
|
||||
float dx = x - ix, dy = y - iy, dz = z - iz;
|
||||
|
||||
// Compute gradient weights
|
||||
ix &= (NOISE_PERM_SIZE-1);
|
||||
iy &= (NOISE_PERM_SIZE-1);
|
||||
iz &= (NOISE_PERM_SIZE-1);
|
||||
float w000 = Grad(ix, iy, iz, dx, dy, dz);
|
||||
float w100 = Grad(ix+1, iy, iz, dx-1, dy, dz);
|
||||
float w010 = Grad(ix, iy+1, iz, dx, dy-1, dz);
|
||||
float w110 = Grad(ix+1, iy+1, iz, dx-1, dy-1, dz);
|
||||
float w001 = Grad(ix, iy, iz+1, dx, dy, dz-1);
|
||||
float w101 = Grad(ix+1, iy, iz+1, dx-1, dy, dz-1);
|
||||
float w011 = Grad(ix, iy+1, iz+1, dx, dy-1, dz-1);
|
||||
float w111 = Grad(ix+1, iy+1, iz+1, dx-1, dy-1, dz-1);
|
||||
|
||||
// Compute trilinear interpolation of weights
|
||||
float wx = NoiseWeight(dx), wy = NoiseWeight(dy), wz = NoiseWeight(dz);
|
||||
float x00 = Lerp(wx, w000, w100);
|
||||
float x10 = Lerp(wx, w010, w110);
|
||||
float x01 = Lerp(wx, w001, w101);
|
||||
float x11 = Lerp(wx, w011, w111);
|
||||
float y0 = Lerp(wy, x00, x10);
|
||||
float y1 = Lerp(wy, x01, x11);
|
||||
return Lerp(wz, y0, y1);
|
||||
}
|
||||
|
||||
|
||||
static float Turbulence(float x, float y, float z, int octaves) {
|
||||
float omega = 0.6;
|
||||
|
||||
float sum = 0., lambda = 1., o = 1.;
|
||||
for (int i = 0; i < octaves; ++i) {
|
||||
sum += abs(o * Noise(lambda * x, lambda * y, lambda * z));
|
||||
lambda *= 1.99f;
|
||||
o *= omega;
|
||||
}
|
||||
return sum * 0.5;
|
||||
}
|
||||
|
||||
|
||||
export void noise_ispc(uniform float x0, uniform float y0, uniform float x1,
|
||||
uniform float y1, uniform int width, uniform int height,
|
||||
uniform float output[])
|
||||
{
|
||||
uniform float dx = (x1 - x0) / width;
|
||||
uniform float dy = (y1 - y0) / height;
|
||||
|
||||
for (uniform int j = 0; j < height; j++) {
|
||||
for (uniform int i = 0; i < width; i += programCount) {
|
||||
float x = x0 + (i + programIndex) * dx;
|
||||
float y = y0 + j * dy;
|
||||
|
||||
int index = (j * width + i + programIndex);
|
||||
output[index] = Turbulence(x, y, 0.6, 8);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
167
examples/noise/noise.vcxproj
Executable file
@@ -0,0 +1,167 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>noise</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="noise.cpp" />
|
||||
<ClCompile Include="noise_serial.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="noise.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
170
examples/noise/noise_serial.cpp
Normal file
@@ -0,0 +1,170 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#define NOISE_PERM_SIZE 256
|
||||
|
||||
static int NoisePerm[2 * NOISE_PERM_SIZE] = {
|
||||
151, 160, 137, 91, 90, 15, 131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140,
|
||||
36, 103, 30, 69, 142, 8, 99, 37, 240, 21, 10, 23, 190, 6, 148, 247, 120,
|
||||
234, 75, 0, 26, 197, 62, 94, 252, 219, 203, 117, 35, 11, 32, 57, 177, 33,
|
||||
88, 237, 149, 56, 87, 174, 20, 125, 136, 171, 168, 68, 175, 74, 165, 71,
|
||||
134, 139, 48, 27, 166, 77, 146, 158, 231, 83, 111, 229, 122, 60, 211, 133,
|
||||
230, 220, 105, 92, 41, 55, 46, 245, 40, 244, 102, 143, 54, 65, 25, 63, 161,
|
||||
1, 216, 80, 73, 209, 76, 132, 187, 208, 89, 18, 169, 200, 196, 135, 130,
|
||||
116, 188, 159, 86, 164, 100, 109, 198, 173, 186, 3, 64, 52, 217, 226, 250,
|
||||
124, 123, 5, 202, 38, 147, 118, 126, 255, 82, 85, 212, 207, 206, 59, 227,
|
||||
47, 16, 58, 17, 182, 189, 28, 42, 223, 183, 170, 213, 119, 248, 152, 2, 44,
|
||||
154, 163, 70, 221, 153, 101, 155, 167, 43, 172, 9, 129, 22, 39, 253, 19,
|
||||
98, 108, 110, 79, 113, 224, 232, 178, 185, 112, 104, 218, 246, 97, 228, 251,
|
||||
34, 242, 193, 238, 210, 144, 12, 191, 179, 162, 241, 81, 51, 145, 235, 249,
|
||||
14, 239, 107, 49, 192, 214, 31, 181, 199, 106, 157, 184, 84, 204, 176, 115,
|
||||
121, 50, 45, 127, 4, 150, 254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72,
|
||||
243, 141, 128, 195, 78, 66, 215, 61, 156, 180, 151, 160, 137, 91, 90, 15,
|
||||
131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140, 36, 103, 30, 69, 142, 8, 99,
|
||||
37, 240, 21, 10, 23, 190, 6, 148, 247, 120, 234, 75, 0, 26, 197, 62, 94, 252,
|
||||
219, 203, 117, 35, 11, 32, 57, 177, 33, 88, 237, 149, 56, 87, 174, 20, 125,
|
||||
136, 171, 168, 68, 175, 74, 165, 71, 134, 139, 48, 27, 166, 77, 146, 158,
|
||||
231, 83, 111, 229, 122, 60, 211, 133, 230, 220, 105, 92, 41, 55, 46, 245,
|
||||
40, 244, 102, 143, 54, 65, 25, 63, 161, 1, 216, 80, 73, 209, 76, 132, 187,
|
||||
208, 89, 18, 169, 200, 196, 135, 130, 116, 188, 159, 86, 164, 100, 109,
|
||||
198, 173, 186, 3, 64, 52, 217, 226, 250, 124, 123, 5, 202, 38, 147, 118,
|
||||
126, 255, 82, 85, 212, 207, 206, 59, 227, 47, 16, 58, 17, 182, 189, 28, 42,
|
||||
223, 183, 170, 213, 119, 248, 152, 2, 44, 154, 163, 70, 221, 153, 101, 155,
|
||||
167, 43, 172, 9, 129, 22, 39, 253, 19, 98, 108, 110, 79, 113, 224, 232,
|
||||
178, 185, 112, 104, 218, 246, 97, 228, 251, 34, 242, 193, 238, 210, 144,
|
||||
12, 191, 179, 162, 241, 81, 51, 145, 235, 249, 14, 239, 107, 49, 192, 214,
|
||||
31, 181, 199, 106, 157, 184, 84, 204, 176, 115, 121, 50, 45, 127, 4, 150,
|
||||
254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72, 243, 141, 128, 195, 78,
|
||||
66, 215, 61, 156, 180
|
||||
};
|
||||
|
||||
|
||||
inline float Clamp(float v, float low, float high) {
|
||||
return v < low ? low : ((v > high) ? high : v);
|
||||
}
|
||||
|
||||
|
||||
inline float SmoothStep(float low, float high, float value) {
|
||||
float v = Clamp((value - low) / (high - low), 0.f, 1.f);
|
||||
return v * v * (-2.f * v + 3.f);
|
||||
}
|
||||
|
||||
|
||||
inline int Floor2Int(float val) {
|
||||
return (int)floorf(val);
|
||||
}
|
||||
|
||||
|
||||
inline float Grad(int x, int y, int z, float dx, float dy, float dz) {
|
||||
int h = NoisePerm[NoisePerm[NoisePerm[x]+y]+z];
|
||||
h &= 15;
|
||||
float u = h<8 || h==12 || h==13 ? dx : dy;
|
||||
float v = h<4 || h==12 || h==13 ? dy : dz;
|
||||
return ((h&1) ? -u : u) + ((h&2) ? -v : v);
|
||||
}
|
||||
|
||||
|
||||
inline float NoiseWeight(float t) {
|
||||
float t3 = t*t*t;
|
||||
float t4 = t3*t;
|
||||
return 6.f*t4*t - 15.f*t4 + 10.f*t3;
|
||||
}
|
||||
|
||||
|
||||
inline float Lerp(float t, float low, float high) {
|
||||
return (1. - t) * low + t * high;
|
||||
}
|
||||
|
||||
|
||||
static float Noise(float x, float y, float z) {
|
||||
// Compute noise cell coordinates and offsets
|
||||
int ix = Floor2Int(x), iy = Floor2Int(y), iz = Floor2Int(z);
|
||||
float dx = x - ix, dy = y - iy, dz = z - iz;
|
||||
|
||||
// Compute gradient weights
|
||||
ix &= (NOISE_PERM_SIZE-1);
|
||||
iy &= (NOISE_PERM_SIZE-1);
|
||||
iz &= (NOISE_PERM_SIZE-1);
|
||||
float w000 = Grad(ix, iy, iz, dx, dy, dz);
|
||||
float w100 = Grad(ix+1, iy, iz, dx-1, dy, dz);
|
||||
float w010 = Grad(ix, iy+1, iz, dx, dy-1, dz);
|
||||
float w110 = Grad(ix+1, iy+1, iz, dx-1, dy-1, dz);
|
||||
float w001 = Grad(ix, iy, iz+1, dx, dy, dz-1);
|
||||
float w101 = Grad(ix+1, iy, iz+1, dx-1, dy, dz-1);
|
||||
float w011 = Grad(ix, iy+1, iz+1, dx, dy-1, dz-1);
|
||||
float w111 = Grad(ix+1, iy+1, iz+1, dx-1, dy-1, dz-1);
|
||||
|
||||
// Compute trilinear interpolation of weights
|
||||
float wx = NoiseWeight(dx), wy = NoiseWeight(dy), wz = NoiseWeight(dz);
|
||||
float x00 = Lerp(wx, w000, w100);
|
||||
float x10 = Lerp(wx, w010, w110);
|
||||
float x01 = Lerp(wx, w001, w101);
|
||||
float x11 = Lerp(wx, w011, w111);
|
||||
float y0 = Lerp(wy, x00, x10);
|
||||
float y1 = Lerp(wy, x01, x11);
|
||||
return Lerp(wz, y0, y1);
|
||||
}
|
||||
|
||||
|
||||
static float Turbulence(float x, float y, float z, int octaves) {
|
||||
float omega = 0.6;
|
||||
|
||||
float sum = 0., lambda = 1., o = 1.;
|
||||
for (int i = 0; i < octaves; ++i) {
|
||||
sum += fabsf(o * Noise(lambda * x, lambda * y, lambda * z));
|
||||
lambda *= 1.99f;
|
||||
o *= omega;
|
||||
}
|
||||
return sum * 0.5;
|
||||
}
|
||||
|
||||
|
||||
void noise_serial(float x0, float y0, float x1, float y1,
|
||||
int width, int height, float output[])
|
||||
{
|
||||
float dx = (x1 - x0) / width;
|
||||
float dy = (y1 - y0) / height;
|
||||
|
||||
for (int j = 0; j < height; j++) {
|
||||
for (int i = 0; i < width; ++i) {
|
||||
float x = x0 + i * dx;
|
||||
float y = y0 + j * dy;
|
||||
|
||||
int index = (j * width + i);
|
||||
output[index] = Turbulence(x, y, 0.6, 8);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
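For reference (a side note, not part of the diff): NoiseWeight() above is Perlin's quintic fade curve and Turbulence() is the usual absolute-value fractal sum,

    w(t) = 6t^5 - 15t^4 + 10t^3,    w(0) = 0,  w(1) = 1,  w'(0) = w'(1) = 0,  w''(0) = w''(1) = 0,

    turbulence(x, y, z) = 0.5 * sum_{i=0}^{octaves-1} omega^i * |Noise(lambda_i x, lambda_i y, lambda_i z)|,    lambda_0 = 1,  lambda_{i+1} = 1.99 lambda_i,  omega = 0.6.

The vanishing first and second derivatives of w at t = 0 and t = 1 are what keep the trilinear blend of gradient weights smooth across noise-cell boundaries.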
1
examples/options/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
options
@@ -41,6 +41,7 @@ using std::max;

#include "options_defs.h"
#include "../timing.h"
#include "../cpuid.h"

#include "options_ispc.h"
using namespace ispc;
@@ -53,9 +54,41 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
                                float ra[], float va[],
                                float result[], int count);

// Make sure that the vector ISA used during compilation is supported by
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
// header file that we include above.
static void
ensureTargetISAIsSupported() {
#if defined(ISPC_TARGET_SSE2)
    bool isaSupported = CPUSupportsSSE2();
    const char *target = "SSE2";
#elif defined(ISPC_TARGET_SSE4)
    bool isaSupported = CPUSupportsSSE4();
    const char *target = "SSE4";
#elif defined(ISPC_TARGET_AVX)
    bool isaSupported = CPUSupportsAVX();
    const char *target = "AVX";
#else
#error "Unknown ISPC_TARGET_* value"
#endif
    if (!isaSupported) {
        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
                "set, which isn't\n*** supported by this computer's CPU!\n", target);
        fprintf(stderr, "***\n*** Please modify the "
#ifdef _MSC_VER
                "MSVC project file "
#else
                "Makefile "
#endif
                "to select another target (e.g. sse2)\n***\n");
        exit(1);
    }
}


int main() {
    // Pointers passed to ispc code must have alignment of the target's
    // vector width at minimum.
    ensureTargetISAIsSupported();

    float *S = new float[N_OPTIONS];
    float *X = new float[N_OPTIONS];
    float *T = new float[N_OPTIONS];
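The CPUSupportsSSE2/SSE4/AVX() helpers come from the newly included ../cpuid.h, whose contents are not shown in this hunk. Purely to illustrate the idea (a hypothetical sketch, not the actual examples/cpuid.h), such a check can be built directly on the cpuid instruction:

// Hypothetical sketch of an SSE4.1 feature check (not the real examples/cpuid.h).
// Uses the GCC/Clang <cpuid.h> helper; an MSVC build would use __cpuid from <intrin.h>.
#include <cpuid.h>

static bool CPUSupportsSSE4_sketch() {
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return false;
    return (ecx & (1u << 19)) != 0;   // CPUID.(EAX=1):ECX bit 19 = SSE4.1
}

An AVX check has one extra wrinkle: besides the CPUID.1:ECX bit 28 feature flag, it should also confirm that the OS saves YMM state (the OSXSAVE bit plus an xgetbv test), which is a good reason to keep all of these checks in one shared header.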
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -82,6 +82,8 @@
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -96,6 +98,8 @@
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -112,6 +116,7 @@
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -130,6 +135,7 @@
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -145,15 +151,15 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="options.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
|
||||
2
examples/rt/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
rt
*.ppm
@@ -44,6 +44,7 @@
|
||||
#include <assert.h>
|
||||
#include <sys/types.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "rt_ispc.h"
|
||||
|
||||
using namespace ispc;
|
||||
@@ -92,12 +93,46 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 2) {
|
||||
fprintf(stderr, "usage: rt <filename base>\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
#define READ(var, n) \
|
||||
if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \
|
||||
fprintf(stderr, "Unexpected EOF reading scene file\n"); \
|
||||
@@ -155,7 +190,9 @@ int main(int argc, char *argv[]) {
|
||||
nodes[i].bounds[1].v[1] = b[4];
|
||||
nodes[i].bounds[1].v[2] = b[5];
|
||||
READ(nodes[i].offset, 1);
|
||||
READ(nodes[i].primsAxis, 1);
|
||||
READ(nodes[i].nPrimitives, 1);
|
||||
READ(nodes[i].splitAxis, 1);
|
||||
READ(nodes[i].pad, 1);
|
||||
}
|
||||
|
||||
// And then read the triangles
|
||||
|
||||
@@ -50,21 +50,11 @@ struct Triangle {
struct LinearBVHNode {
    uniform float3 bounds[2];
    uniform unsigned int offset;    // num primitives for leaf, second child for interior
    uniform unsigned int primsAxis; // 0:7 nPrimitives, 8:15 split axis, 16:31 padding
    uniform unsigned int8 nPrimitives;
    uniform unsigned int8 splitAxis;
    uniform unsigned int16 pad;
};

static inline uniform int nPrims(const reference LinearBVHNode node) {
    return (node.primsAxis & 0xff);
}

static inline uniform int axis(const reference LinearBVHNode node) {
    return ((node.primsAxis >> 8) & 0xff);
}

static inline uniform bool isInterior(const reference LinearBVHNode node) {
    return nPrims(node) == 0;
}

static inline float3 Cross(const float3 v1, const float3 v2) {
    float v1x = v1.x, v1y = v1.y, v1z = v1.z;
    float v2x = v2.x, v2y = v2.y, v2z = v2.z;
@@ -199,7 +189,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
        // Check ray against BVH node
        LinearBVHNode node = nodes[nodeNum];
        if (any(BBoxIntersect(node.bounds, ray))) {
            uniform unsigned int nPrimitives = nPrims(node);
            uniform unsigned int nPrimitives = node.nPrimitives;
            if (nPrimitives > 0) {
                // Intersect ray with primitives in leaf BVH node
                uniform unsigned int primitivesOffset = node.offset;
@@ -213,7 +203,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
            }
            else {
                // Put far BVH node on _todo_ stack, advance to near node
                if (r.dirIsNeg[axis(node)]) {
                if (r.dirIsNeg[node.splitAxis]) {
                    todo[todoOffset++] = nodeNum + 1;
                    nodeNum = node.offset;
                }
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -81,6 +81,8 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -94,6 +96,8 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -109,6 +113,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -126,6 +131,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -138,18 +144,18 @@
|
||||
<CustomBuild Include="rt.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
|
||||
@@ -39,6 +39,7 @@
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <stdint.h>
|
||||
|
||||
// Just enough of a float3 class to do what we need in this file.
|
||||
#ifdef _MSC_VER
|
||||
@@ -75,30 +76,20 @@ struct Ray {
|
||||
namespace ispc {
|
||||
struct Triangle {
|
||||
float3 p[3];
|
||||
int id;
|
||||
int32_t id;
|
||||
};
|
||||
|
||||
struct LinearBVHNode {
|
||||
float3 bounds[2];
|
||||
unsigned int offset; // primitives for leaf, second child for interior
|
||||
unsigned int primsAxis; // 0:7 nPrimitives, 8:15 split axis, 16:31 padding
|
||||
int32_t offset; // primitives for leaf, second child for interior
|
||||
uint8_t nPrimitives;
|
||||
uint8_t splitAxis;
|
||||
uint16_t pad;
|
||||
};
|
||||
}
|
||||
|
||||
using namespace ispc;
|
||||
|
||||
inline int nPrims(const LinearBVHNode &node) {
|
||||
return (node.primsAxis & 0xff);
|
||||
}
|
||||
|
||||
inline int axis(const LinearBVHNode &node) {
|
||||
return ((node.primsAxis >> 8) & 0xff);
|
||||
}
|
||||
|
||||
inline bool isInterior(const LinearBVHNode &node) {
|
||||
return nPrims(node) == 0;
|
||||
}
|
||||
|
||||
inline float3 Cross(const float3 &v1, const float3 &v2) {
|
||||
float v1x = v1.x, v1y = v1.y, v1z = v1.z;
|
||||
float v2x = v2.x, v2y = v2.y, v2z = v2.z;
|
||||
@@ -230,7 +221,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||
// Check ray against BVH node
|
||||
const LinearBVHNode &node = nodes[nodeNum];
|
||||
if (BBoxIntersect(node.bounds, ray)) {
|
||||
unsigned int nPrimitives = nPrims(node);
|
||||
unsigned int nPrimitives = node.nPrimitives;
|
||||
if (nPrimitives > 0) {
|
||||
// Intersect ray with primitives in leaf BVH node
|
||||
unsigned int primitivesOffset = node.offset;
|
||||
@@ -244,7 +235,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||
}
|
||||
else {
|
||||
// Put far BVH node on _todo_ stack, advance to near node
|
||||
if (r.dirIsNeg[axis(node)]) {
|
||||
if (r.dirIsNeg[node.splitAxis]) {
|
||||
todo[todoOffset++] = nodeNum + 1;
|
||||
nodeNum = node.offset;
|
||||
}
|
||||
|
||||
@@ -32,12 +32,48 @@
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "../cpuid.h"
|
||||
|
||||
// Include the header file that the ispc compiler generates
|
||||
#include "simple_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
float vin[16], vout[16];
|
||||
|
||||
// Initialize input buffer
|
||||
|
||||
@@ -25,18 +25,18 @@
|
||||
<CustomBuild Include="simple.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
cl /E /TP %(Filename).ispc | ispc -O2 -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
|
||||
87
expr.h
@@ -39,6 +39,7 @@
|
||||
#define ISPC_EXPR_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include "type.h"
|
||||
|
||||
class FunctionSymbolExpr;
|
||||
|
||||
@@ -96,7 +97,7 @@ public:
|
||||
that incorporates the given error message string. In either
|
||||
failure case, NULL is returned. */
|
||||
Expr *TypeConv(const Type *type, const char *errorMsgBase = NULL,
|
||||
bool failureOk = false);
|
||||
bool failureOk = false, bool issuePrecisionWarnings = true);
|
||||
};
|
||||
|
||||
|
||||
@@ -291,23 +292,28 @@ private:
|
||||
|
||||
|
||||
/** @brief Expression representing member selection ("foo.bar").
|
||||
*
|
||||
* This will also be overloaded to deal with swizzles.
|
||||
*/
|
||||
class MemberExpr : public Expr {
|
||||
public:
|
||||
static MemberExpr* create(Expr *expr, const char *identifier,
|
||||
SourcePos pos, SourcePos identifierPos);
|
||||
|
||||
MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
|
||||
SourcePos identifierPos);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
Symbol *GetBaseSymbol() const;
|
||||
void Print() const;
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
virtual llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
virtual llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||
virtual const Type *GetType() const;
|
||||
virtual Symbol *GetBaseSymbol() const;
|
||||
virtual void Print() const;
|
||||
virtual Expr *Optimize();
|
||||
virtual Expr *TypeCheck();
|
||||
virtual int getElementNumber() const;
|
||||
|
||||
private:
|
||||
protected:
|
||||
std::string getCandidateNearMatches() const;
|
||||
int getElementNumber() const;
|
||||
|
||||
Expr *expr;
|
||||
std::string identifier;
|
||||
@@ -318,12 +324,30 @@ private:
|
||||
/** @brief Expression representing a compile-time constant value.
|
||||
|
||||
This class can currently represent compile-time constants of anything
|
||||
that is an AtomicType; for anything more complex, we don't currently
|
||||
have a representation of a compile-time constant that can be further
|
||||
reasoned about.
|
||||
that is an AtomicType or an EnumType; for anything more complex, we
|
||||
don't currently have a representation of a compile-time constant that
|
||||
can be further reasoned about.
|
||||
*/
|
||||
class ConstExpr : public Expr {
|
||||
public:
|
||||
/** Create a ConstExpr from a uniform int8 value */
|
||||
ConstExpr(const Type *t, int8_t i, SourcePos p);
|
||||
/** Create a ConstExpr from a varying int8 value */
|
||||
ConstExpr(const Type *t, int8_t *i, SourcePos p);
|
||||
/** Create a ConstExpr from a uniform uint8 value */
|
||||
ConstExpr(const Type *t, uint8_t u, SourcePos p);
|
||||
/** Create a ConstExpr from a varying uint8 value */
|
||||
ConstExpr(const Type *t, uint8_t *u, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr from a uniform int16 value */
|
||||
ConstExpr(const Type *t, int16_t i, SourcePos p);
|
||||
/** Create a ConstExpr from a varying int16 value */
|
||||
ConstExpr(const Type *t, int16_t *i, SourcePos p);
|
||||
/** Create a ConstExpr from a uniform uint16 value */
|
||||
ConstExpr(const Type *t, uint16_t u, SourcePos p);
|
||||
/** Create a ConstExpr from a varying uint16 value */
|
||||
ConstExpr(const Type *t, uint16_t *u, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr from a uniform int32 value */
|
||||
ConstExpr(const Type *t, int32_t i, SourcePos p);
|
||||
/** Create a ConstExpr from a varying int32 value */
|
||||
@@ -332,14 +356,17 @@ public:
|
||||
ConstExpr(const Type *t, uint32_t u, SourcePos p);
|
||||
/** Create a ConstExpr from a varying uint32 value */
|
||||
ConstExpr(const Type *t, uint32_t *u, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr from a uniform float value */
|
||||
ConstExpr(const Type *t, float f, SourcePos p);
|
||||
/** Create a ConstExpr from a varying float value */
|
||||
ConstExpr(const Type *t, float *f, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr from a uniform double value */
|
||||
ConstExpr(const Type *t, double d, SourcePos p);
|
||||
/** Create a ConstExpr from a varying double value */
|
||||
ConstExpr(const Type *t, double *d, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr from a uniform int64 value */
|
||||
ConstExpr(const Type *t, int64_t i, SourcePos p);
|
||||
/** Create a ConstExpr from a varying int64 value */
|
||||
@@ -348,10 +375,12 @@ public:
|
||||
ConstExpr(const Type *t, uint64_t i, SourcePos p);
|
||||
/** Create a ConstExpr from a varying uint64 value */
|
||||
ConstExpr(const Type *t, uint64_t *i, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr from a uniform bool value */
|
||||
ConstExpr(const Type *t, bool b, SourcePos p);
|
||||
/** Create a ConstExpr from a varying bool value */
|
||||
ConstExpr(const Type *t, bool *b, SourcePos p);
|
||||
|
||||
/** Create a ConstExpr of the same type as the given old ConstExpr,
|
||||
with values given by the "values" parameter. */
|
||||
ConstExpr(ConstExpr *old, double *values);
|
||||
@@ -370,6 +399,30 @@ public:
|
||||
equal to the target vector width into the given pointer. */
|
||||
int AsBool(bool *, bool forceVarying = false) const;
|
||||
|
||||
/** Return the ConstExpr's values as int8s, doing type conversion
|
||||
from the actual type if needed. If forceVarying is true, then type
|
||||
convert to 'varying' so as to always return a number of values
|
||||
equal to the target vector width into the given pointer. */
|
||||
int AsInt8(int8_t *, bool forceVarying = false) const;
|
||||
|
||||
/** Return the ConstExpr's values as uint8s, doing type conversion
|
||||
from the actual type if needed. If forceVarying is true, then type
|
||||
convert to 'varying' so as to always return a number of values
|
||||
equal to the target vector width into the given pointer. */
|
||||
int AsUInt8(uint8_t *, bool forceVarying = false) const;
|
||||
|
||||
/** Return the ConstExpr's values as int16s, doing type conversion
|
||||
from the actual type if needed. If forceVarying is true, then type
|
||||
convert to 'varying' so as to always return a number of values
|
||||
equal to the target vector width into the given pointer. */
|
||||
int AsInt16(int16_t *, bool forceVarying = false) const;
|
||||
|
||||
/** Return the ConstExpr's values as uint16s, doing type conversion
|
||||
from the actual type if needed. If forceVarying is true, then type
|
||||
convert to 'varying' so as to always return a number of values
|
||||
equal to the target vector width into the given pointer. */
|
||||
int AsUInt16(uint16_t *, bool forceVarying = false) const;
|
||||
|
||||
/** Return the ConstExpr's values as int32s, doing type conversion
|
||||
from the actual type if needed. If forceVarying is true, then type
|
||||
convert to 'varying' so as to always return a number of values
|
||||
@@ -412,8 +465,14 @@ public:
|
||||
int Count() const;
|
||||
|
||||
private:
|
||||
const AtomicType *type;
|
||||
AtomicType::BasicType getBasicType() const;
|
||||
|
||||
const Type *type;
|
||||
union {
|
||||
int8_t int8Val[ISPC_MAX_NVEC];
|
||||
uint8_t uint8Val[ISPC_MAX_NVEC];
|
||||
int16_t int16Val[ISPC_MAX_NVEC];
|
||||
uint16_t uint16Val[ISPC_MAX_NVEC];
|
||||
int32_t int32Val[ISPC_MAX_NVEC];
|
||||
uint32_t uint32Val[ISPC_MAX_NVEC];
|
||||
bool boolVal[ISPC_MAX_NVEC];
|
||||
|
||||
16
failing_tests/shuffle2-10.ispc
Normal file
@@ -0,0 +1,16 @@

/* failing due to llvm bug http://llvm.org/bugs/show_bug.cgi?id=10421 */

export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int8 aa = aFOO[programIndex];
    int8 bb = aa + programCount;
    int8 shuf = shuffle(aa, bb, 2*programIndex+(int)b-5);
    //CO print("%\n%\n%\n%\n", aa, bb, 2*programIndex+(int)b-5, shuf);
    RET[programIndex] = shuf;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + 2*programIndex;
}
11
ispc.cpp
@@ -135,3 +135,14 @@ SourcePos::Print() const {
    printf(" @ [%s:%d.%d - %d.%d] ", name, first_line, first_column,
           last_line, last_column);
}


bool
SourcePos::operator==(const SourcePos &p2) const {
    return (!strcmp(name, p2.name) &&
            first_line == p2.first_line &&
            first_column == p2.first_column &&
            last_line == p2.last_line &&
            last_column == p2.last_column);
}
11
ispc.h
@@ -73,6 +73,13 @@ namespace llvm {
    class Value;
}

// llvm::Type *s are no longer const in llvm 3.0
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
#define LLVM_TYPE_CONST
#else
#define LLVM_TYPE_CONST const
#endif

class ArrayType;
class AtomicType;
class DeclSpecs;
@@ -110,6 +117,8 @@ struct SourcePos {

    /** Returns a LLVM DIFile object that represents the SourcePos's file */
    llvm::DIFile GetDIFile() const;

    bool operator==(const SourcePos &p2) const;
};


@@ -149,7 +158,7 @@ public:
struct Target {
    Target();

    /** Enumerant giving the instruction sets that the compiler can
    /** Enumerator giving the instruction sets that the compiler can
        target. */
    enum ISA { SSE2, SSE4, AVX };
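The LLVM_TYPE_CONST macro added above lets the same declarations compile against both LLVM 2.9 and 3.0. An illustrative use (a hypothetical sketch, not a line from this diff):

// Expands to "const llvm::Type *" before LLVM 3.0, and to plain
// "llvm::Type *" when LLVM_3_0 / LLVM_3_0svn is defined.
LLVM_TYPE_CONST llvm::Type *elementType;
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;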
58
ispc.vcxproj
@@ -28,11 +28,11 @@
|
||||
<ClCompile Include="main.cpp" />
|
||||
<ClCompile Include="opt.cpp" />
|
||||
<ClCompile Include="parse.cc" />
|
||||
<CustomBuild Include="stdlib-c.c">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c > gen-bitcode-c.cpp</Command>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang stdlib-c.c</Message>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c > gen-bitcode-c.cpp</Command>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang stdlib-c.c</Message>
|
||||
<CustomBuild Include="builtins-c.c">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c.c > gen-bitcode-c.cpp</Command>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang builtins-c.c</Message>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c.c > gen-bitcode-c.cpp</Command>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang builtins-c.c</Message>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c.cpp</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c.cpp</Outputs>
|
||||
</CustomBuild>
|
||||
@@ -59,62 +59,62 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stdlib.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib.cpp</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib.cpp</Outputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stdlib-sse4.ll">
|
||||
<CustomBuild Include="builtins-sse4.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stdlib-sse4x2.ll">
|
||||
<CustomBuild Include="builtins-sse4x2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stdlib-sse2.ll">
|
||||
<CustomBuild Include="builtins-sse2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stdlib-avx.ll">
|
||||
<CustomBuild Include="builtins-avx.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
|
||||
</CustomBuild>
|
||||
@@ -187,7 +187,7 @@
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
@@ -207,7 +207,7 @@
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
@@ -33,6 +33,10 @@

#define _CRT_SECURE_NO_WARNINGS

#ifdef ISPC_IS_WINDOWS
#define NOMINMAX
#include <windows.h>
#endif
#include <stdio.h>
#include <stdint.h>

@@ -77,6 +81,8 @@ extern "C" {

extern "C" {
    void ISPCLaunch(void *, void *);
    void ISPCSync();
    void *ISPCMalloc(int64_t size, int32_t alignment);
    void ISPCFree(void *ptr);
}

void ISPCLaunch(void *func, void *data) {

@@ -89,6 +95,18 @@ void ISPCLaunch(void *func, void *data) {

void ISPCSync() {
}


#ifdef ISPC_IS_WINDOWS
void *ISPCMalloc(int64_t size, int32_t alignment) {
    return _aligned_malloc(size, alignment);
}


void ISPCFree(void *ptr) {
    _aligned_free(ptr);
}
#endif
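
These hooks only exist on Windows, and memory handed out by _aligned_malloc() must go back through _aligned_free(), never plain free(). A small standalone sketch of that pairing; the posix_memalign() branch is an added assumption for comparison and is not part of ispc_test:

    // Sketch of the aligned-allocation pairing the block above relies on:
    // _aligned_malloc() memory must be released with _aligned_free().
    #include <stdio.h>
    #include <stdlib.h>
    #ifdef _WIN32
    #include <malloc.h>
    #endif

    static void *alignedAlloc(size_t size, size_t alignment) {
    #ifdef _WIN32
        return _aligned_malloc(size, alignment);
    #else
        // Assumed POSIX equivalent, for comparison only
        void *ptr = NULL;
        if (posix_memalign(&ptr, alignment, size) != 0)
            return NULL;
        return ptr;
    #endif
    }

    static void alignedFree(void *ptr) {
    #ifdef _WIN32
        _aligned_free(ptr);
    #else
        free(ptr);
    #endif
    }

    int main() {
        void *p = alignedAlloc(1024, 64);
        printf("%p\n", p);
        alignedFree(p);
        return 0;
    }
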
|
||||
|
||||
static void usage(int ret) {
|
||||
fprintf(stderr, "usage: ispc_test\n");
|
||||
fprintf(stderr, "\t[-h/--help]\tprint help\n");
|
||||
@@ -140,32 +158,40 @@ static bool lRunTest(const char *fn) {
|
||||
}
|
||||
|
||||
llvm::Function *func;
|
||||
if ((func = module->getFunction("ISPCLaunch")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)ISPCLaunch);
|
||||
if ((func = module->getFunction("ISPCSync")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)ISPCSync);
|
||||
if ((func = module->getFunction("putchar")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)putchar);
|
||||
if ((func = module->getFunction("printf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)printf);
|
||||
if ((func = module->getFunction("fflush")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)fflush);
|
||||
if ((func = module->getFunction("sinf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)sinf);
|
||||
if ((func = module->getFunction("cosf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)cosf);
|
||||
if ((func = module->getFunction("tanf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)tanf);
|
||||
if ((func = module->getFunction("atanf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)atanf);
|
||||
if ((func = module->getFunction("atan2f")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)atan2f);
|
||||
if ((func = module->getFunction("powf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)powf);
|
||||
if ((func = module->getFunction("expf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)expf);
|
||||
if ((func = module->getFunction("logf")) != NULL)
|
||||
ee->addGlobalMapping(func, (void *)logf);
|
||||
#define DO_FUNC(FUNC ,FUNCNAME) \
|
||||
if ((func = module->getFunction(FUNCNAME)) != NULL) \
|
||||
ee->addGlobalMapping(func, (void *)FUNC)
|
||||
DO_FUNC(ISPCLaunch, "ISPCLaunch");
|
||||
DO_FUNC(ISPCSync, "ISPCSync");
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
DO_FUNC(ISPCMalloc, "ISPCMalloc");
|
||||
DO_FUNC(ISPCFree, "ISPCFree");
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
DO_FUNC(putchar, "putchar");
|
||||
DO_FUNC(printf, "printf");
|
||||
DO_FUNC(fflush, "fflush");
|
||||
DO_FUNC(sinf, "sinf");
|
||||
DO_FUNC(cosf, "cosf");
|
||||
DO_FUNC(tanf, "tanf");
|
||||
DO_FUNC(atanf, "atanf");
|
||||
DO_FUNC(atan2f, "atan2f");
|
||||
DO_FUNC(powf, "powf");
|
||||
DO_FUNC(expf, "expf");
|
||||
DO_FUNC(logf, "logf");
|
||||
DO_FUNC(sin, "sin");
|
||||
DO_FUNC(cos, "cos");
|
||||
DO_FUNC(tan, "tan");
|
||||
DO_FUNC(atan, "atan");
|
||||
DO_FUNC(atan2, "atan2");
|
||||
DO_FUNC(pow, "pow");
|
||||
DO_FUNC(exp, "exp");
|
||||
DO_FUNC(log, "log");
|
||||
DO_FUNC(memset, "memset");
|
||||
#ifdef ISPC_IS_APPLE
|
||||
DO_FUNC(memset_pattern4, "memset_pattern4");
|
||||
DO_FUNC(memset_pattern8, "memset_pattern8");
|
||||
DO_FUNC(memset_pattern16, "memset_pattern16");
|
||||
#endif
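
The DO_FUNC macro above replaces the long run of getFunction()/addGlobalMapping() pairs with one line per symbol. A standalone sketch of the same pattern, with a std::map standing in for the LLVM ExecutionEngine (the names here are illustrative, not part of ispc_test):

    // Minimal sketch of the DO_FUNC pattern: register host function pointers
    // under their symbol names with a single macro invocation per function.
    #include <math.h>
    #include <cstdio>
    #include <map>
    #include <string>

    static std::map<std::string, void *> registry;   // stand-in for the JIT

    static void addMapping(const char *name, void *addr) {
        registry[name] = addr;
    }

    #define DO_FUNC(FUNC, FUNCNAME) addMapping(FUNCNAME, (void *)FUNC)

    int main() {
        DO_FUNC(sinf, "sinf");
        DO_FUNC(cosf, "cosf");
        DO_FUNC(powf, "powf");
        printf("%zu functions registered\n", registry.size());
        return 0;
    }
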
|
||||
|
||||
#ifdef ISPC_HAVE_SVML
|
||||
#define DO_SVML(FUNC ,FUNCNAME) \
|
||||
|
||||
@@ -52,7 +52,7 @@
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>ISPC_IS_WINDOWS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -70,7 +70,7 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>ISPC_IS_WINDOWS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
|
||||
160
lex.ll
@@ -35,16 +35,19 @@

#include "ispc.h"
#include "decl.h"
#include "parse.hh"
#include "sym.h"
#include "util.h"
#include "module.h"
#include "type.h"
#include "parse.hh"
#include <stdlib.h>

static uint32_t lParseBinary(const char *ptr, SourcePos pos);
static uint64_t lParseBinary(const char *ptr, SourcePos pos);
static void lCComment(SourcePos *);
static void lCppComment(SourcePos *);
static void lHandleCppHash(SourcePos *);
static void lStringConst(YYSTYPE *, SourcePos *);
static double lParseHexFloat(const char *ptr);

#define YY_USER_ACTION \
    yylloc->first_line = yylloc->last_line; \

@@ -65,9 +68,11 @@ inline int isatty(int) { return 0; }

WHITESPACE [ \t\r]+
INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))
FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)|([-]?0x[01]\.?[0-9a-fA-F]+p[-+]?[0-9]+[fF]?)
FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)

IDENT [a-zA-Z_][a-zA-Z_0-9]*
ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+
|
||||
|
||||
%%
|
||||
"/*" { lCComment(yylloc); }
|
||||
@@ -100,6 +105,8 @@ goto { return TOKEN_GOTO; }
|
||||
if { return TOKEN_IF; }
|
||||
inline { return TOKEN_INLINE; }
|
||||
int { return TOKEN_INT; }
|
||||
int8 { return TOKEN_INT8; }
|
||||
int16 { return TOKEN_INT16; }
|
||||
int32 { return TOKEN_INT; }
|
||||
int64 { return TOKEN_INT64; }
|
||||
launch { return TOKEN_LAUNCH; }
|
||||
@@ -134,61 +141,66 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;

{INT_NUMBER} {
    char *endPtr = NULL;
#ifdef ISPC_IS_WINDOWS
    unsigned long val;
#else
    unsigned long long val;
#endif
    int64_t val;

    if (yytext[0] == '0' && yytext[1] == 'b')
        val = lParseBinary(yytext+2, *yylloc);
    else {
#ifdef ISPC_IS_WINDOWS
        val = strtoul(yytext, &endPtr, 0);
        val = _strtoi64(yytext, &endPtr, 0);
#else
        // FIXME: should use strtouq and then issue an error if we can't
        // fit into 64 bits...
        val = strtoull(yytext, &endPtr, 0);
#endif
    }
    yylval->int32Val = (int32_t)val;
    if (val != (unsigned int)yylval->int32Val)
        Warning(*yylloc, "32-bit integer has insufficient bits to represent value %s (%x %llx)",
                yytext, yylval->int32Val, (unsigned long long)val);
    return TOKEN_INT_CONSTANT;

    // See if we can fit this into a 32-bit integer...
    if ((val & 0xffffffff) == val) {
        yylval->int32Val = (int32_t)val;
        return TOKEN_INT32_CONSTANT;
    }
    else {
        yylval->int64Val = val;
        return TOKEN_INT64_CONSTANT;
    }
}
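
With this change an integer literal is parsed into a full 64-bit value and only handed back as a 32-bit token when it actually fits. A minimal standalone sketch of that selection logic (hypothetical helper and a stand-in Token enum, not the real flex rule or token set):

    // Classify an integer literal the way the rule above does: parse the
    // full value, then choose a 32- or 64-bit constant token.
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    enum Token { TOKEN_INT32_CONSTANT, TOKEN_INT64_CONSTANT };

    static Token classifyIntLiteral(const char *text, int64_t *out) {
        char *endPtr = NULL;
        int64_t val;
        if (text[0] == '0' && text[1] == 'b')
            val = (int64_t)strtoull(text + 2, &endPtr, 2);  // binary literal
        else
            val = (int64_t)strtoull(text, &endPtr, 0);      // dec/hex/octal
        *out = val;
        // Same test as above: does the value fit in 32 bits?
        return ((val & 0xffffffff) == val) ? TOKEN_INT32_CONSTANT
                                           : TOKEN_INT64_CONSTANT;
    }

    int main() {
        int64_t v;
        printf("%d\n", classifyIntLiteral("0xffffffff", &v) == TOKEN_INT32_CONSTANT);  // 1
        printf("%d\n", classifyIntLiteral("0x100000000", &v) == TOKEN_INT64_CONSTANT); // 1
        return 0;
    }
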
|
||||
|
||||
{INT_NUMBER}[uU] {
    char *endPtr = NULL;
#ifdef ISPC_IS_WINDOWS
    unsigned long val;
#else
    unsigned long long val;
#endif
    uint64_t val;

    if (yytext[0] == '0' && yytext[1] == 'b')
        val = lParseBinary(yytext+2, *yylloc);
    else {
#ifdef ISPC_IS_WINDOWS
        val = strtoul(yytext, &endPtr, 0);
        val = _strtoui64(yytext, &endPtr, 0);
#else
        val = strtoull(yytext, &endPtr, 0);
#endif
    }

    yylval->int32Val = (int32_t)val;
    if (val != (unsigned int)yylval->int32Val)
        Warning(*yylloc, "32-bit integer has insufficient bits to represent value %s (%x %llx)",
                yytext, yylval->int32Val, (unsigned long long)val);
    return TOKEN_UINT_CONSTANT;
    if ((val & 0xffffffff) == val) {
        // we can represent it in a 32-bit value
        yylval->int32Val = (int32_t)val;
        return TOKEN_UINT32_CONSTANT;
    }
    else {
        yylval->int64Val = val;
        return TOKEN_UINT64_CONSTANT;
    }
}

{FLOAT_NUMBER} {
    /* FIXME: need to implement a hex float constant parser so that we can
       support them on Windows (which doesn't handle them in its atof()
       implementation... */
    yylval->floatVal = atof(yytext);
    return TOKEN_FLOAT_CONSTANT;
}

{HEX_FLOAT_NUMBER} {
    yylval->floatVal = lParseHexFloat(yytext);
    return TOKEN_FLOAT_CONSTANT;
}
|
||||
|
||||
"++" { return TOKEN_INC_OP; }
|
||||
"--" { return TOKEN_DEC_OP; }
|
||||
"<<" { return TOKEN_LEFT_OP; }
|
||||
@@ -264,19 +276,18 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;

/** Return the integer version of a binary constant from a string.
 */
static uint32_t
static uint64_t
lParseBinary(const char *ptr, SourcePos pos) {
    uint32_t val = 0;
    uint64_t val = 0;
    bool warned = false;

    while (*ptr != '\0') {
        /* if this hits, the regexp for 0b... constants is broken */
        assert(*ptr == '0' || *ptr == '1');

        if ((val & (1<<31)) && warned == false) {
        if ((val & (((int64_t)1)<<63)) && warned == false) {
            // We're about to shift out a set bit
            // FIXME: 64-bit int constants...
            Warning(pos, "Can't represent binary constant with 32-bit integer type");
            Warning(pos, "Can't represent binary constant with a 64-bit integer type");
            warned = true;
        }
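
The hunk above widens lParseBinary() from 32 to 64 bits but is cut off before the accumulation step. A standalone sketch of the whole routine under that assumption (the shift-and-or body and the fprintf stand in for parts not shown in the diff; Warning/SourcePos are ispc internals):

    // Sketch of a 64-bit binary-constant parser in the spirit of the change
    // above; assumes the "0b" prefix has already been stripped.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    static uint64_t parseBinary(const char *ptr) {
        uint64_t val = 0;
        bool warned = false;
        while (*ptr != '\0') {
            assert(*ptr == '0' || *ptr == '1');
            if ((val & (((uint64_t)1) << 63)) && !warned) {
                // About to shift a set bit out of the top of the 64-bit value
                fprintf(stderr, "Can't represent binary constant with a 64-bit integer type\n");
                warned = true;
            }
            val = (val << 1) | (uint64_t)(*ptr - '0');  // assumed accumulation step
            ++ptr;
        }
        return val;
    }

    int main() {
        printf("%llu\n", (unsigned long long)parseBinary("101"));  // 5
        return 0;
    }
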
|
||||
|
||||
@@ -389,12 +400,12 @@ lEscapeChar(char *str, char *pChar, SourcePos *pos)

    // octal constants \012
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7':
        *pChar = strtol(str, &tail, 8);
        *pChar = (char)strtol(str, &tail, 8);
        str = tail - 1;
        break;
    // hexidecimal constant \xff
    case 'x':
        *pChar = strtol(str, &tail, 16);
        *pChar = (char)strtol(str, &tail, 16);
        str = tail - 1;
        break;
    default:

@@ -424,3 +435,82 @@ lStringConst(YYSTYPE *yylval, SourcePos *pos)

    }
    yylval->stringVal = new std::string(str);
}
|
||||
|
||||
|
||||
/** Compute the value 2^n, where the exponent is given as an integer.
    There are more efficient ways to do this, for example by just slamming
    the bits into the appropriate bits of the double, but let's just do the
    obvious thing.
 */
static double
ipow2(int exponent) {
    if (exponent < 0)
        return 1. / ipow2(-exponent);

    double ret = 1.;
    while (exponent > 16) {
        ret *= 65536.;
        exponent -= 16;
    }
    while (exponent-- > 0)
        ret *= 2.;
    return ret;
}
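
Because ipow2() only ever multiplies or divides by powers of two, it is exact for any exponent whose power of two is representable as a double. A quick standalone check against the C library's ldexp(), which computes the same 2^n scaling:

    // Verify the ipow2() helper above against ldexp() over a range of exponents.
    #include <cmath>
    #include <cstdio>

    static double ipow2(int exponent) {
        if (exponent < 0)
            return 1. / ipow2(-exponent);
        double ret = 1.;
        while (exponent > 16) {
            ret *= 65536.;
            exponent -= 16;
        }
        while (exponent-- > 0)
            ret *= 2.;
        return ret;
    }

    int main() {
        for (int e = -40; e <= 40; ++e)
            if (ipow2(e) != ldexp(1.0, e))
                printf("mismatch at exponent %d\n", e);
        printf("done\n");   // no mismatches expected in this range
        return 0;
    }
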
|
||||
|
||||
|
||||
/** Parse a hexadecimal-formatted floating-point number (C99 hex float
    constant-style).
 */
static double
lParseHexFloat(const char *ptr) {
    assert(ptr != NULL);

    assert(ptr[0] == '0' && ptr[1] == 'x');
    ptr += 2;

    // Start initializing the mantissa
    assert(*ptr == '0' || *ptr == '1');
    double mantissa = (*ptr == '1') ? 1. : 0.;
    ++ptr;

    if (*ptr == '.') {
        // Is there a fraction part?  If so, the i'th digit we encounter
        // gives the 1/(16^i) component of the mantissa.
        ++ptr;

        double scale = 1. / 16.;
        // Keep going until we come to the 'p', which indicates that we've
        // come to the exponent
        while (*ptr != 'p') {
            // Figure out the raw value from 0-15
            int digit;
            if (*ptr >= '0' && *ptr <= '9')
                digit = *ptr - '0';
            else if (*ptr >= 'a' && *ptr <= 'f')
                digit = 10 + *ptr - 'a';
            else {
                assert(*ptr >= 'A' && *ptr <= 'F');
                digit = 10 + *ptr - 'A';
            }

            // And add its contribution to the mantissa
            mantissa += scale * digit;
            scale /= 16.;
            ++ptr;
        }
    }
    else
        // If there's not a '.', then we better be going straight to the
        // exponent
        assert(*ptr == 'p');

    ++ptr; // skip the 'p'

    // interestingly enough, the exponent is provided base 10..
    int exponent = (int)strtol(ptr, (char **)NULL, 10);

    // Does stdlib exp2() guarantee exact results for integer n where can
    // be represented exactly as doubles?  I would hope so but am not sure,
    // so let's be sure.
    return mantissa * ipow2(exponent);
}
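
A worked example of the parse above: 0x1.8p3 has mantissa 1 + 8/16 = 1.5 and a base-10 exponent field of 3, so the value is 1.5 * 2^3 = 12.0. The sketch below compares the hand computation against strtod(), assuming a C99-conforming strtod() that accepts hex floats (the Windows atof() limitation is exactly why this parser exists):

    // Hand-computed hex-float value versus the C library's strtod().
    #include <cstdio>
    #include <cstdlib>

    int main() {
        double mantissa = 1.0 + 8.0 / 16.0;          // "1.8" in hex
        double value = mantissa * 8.0;               // times 2^3
        printf("hand-computed: %g\n", value);        // 12
        printf("strtod:        %g\n", strtod("0x1.8p3", NULL));
        return 0;
    }
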
|
||||
|
||||
196
llvmutil.cpp
@@ -38,30 +38,43 @@
|
||||
#include "llvmutil.h"
|
||||
#include "type.h"
|
||||
|
||||
const llvm::Type *LLVMTypes::VoidType = NULL;
|
||||
const llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
|
||||
const llvm::Type *LLVMTypes::BoolType = NULL;
|
||||
const llvm::Type *LLVMTypes::Int8Type = NULL;
|
||||
const llvm::Type *LLVMTypes::Int16Type = NULL;
|
||||
const llvm::Type *LLVMTypes::Int32Type = NULL;
|
||||
const llvm::Type *LLVMTypes::Int32PointerType = NULL;
|
||||
const llvm::Type *LLVMTypes::Int64Type = NULL;
|
||||
const llvm::Type *LLVMTypes::Int64PointerType = NULL;
|
||||
const llvm::Type *LLVMTypes::FloatType = NULL;
|
||||
const llvm::Type *LLVMTypes::FloatPointerType = NULL;
|
||||
const llvm::Type *LLVMTypes::DoubleType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
|
||||
LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::BoolType = NULL;
|
||||
|
||||
const llvm::VectorType *LLVMTypes::MaskType = NULL;
|
||||
const llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
|
||||
const llvm::VectorType *LLVMTypes::Int1VectorType = NULL;
|
||||
const llvm::VectorType *LLVMTypes::Int32VectorType = NULL;
|
||||
const llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
|
||||
const llvm::VectorType *LLVMTypes::Int64VectorType = NULL;
|
||||
const llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
|
||||
const llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
|
||||
const llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
|
||||
const llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
|
||||
const llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8Type = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16Type = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32Type = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64Type = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleType = NULL;
|
||||
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8PointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16PointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32PointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64PointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoublePointerType = NULL;
|
||||
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::MaskType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
|
||||
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int1VectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int8VectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int16VectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int32VectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int64VectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
|
||||
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8VectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16VectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
|
||||
|
||||
LLVM_TYPE_CONST llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
|
||||
|
||||
llvm::Constant *LLVMTrue = NULL;
|
||||
llvm::Constant *LLVMFalse = NULL;
|
||||
@@ -73,17 +86,22 @@ void
|
||||
InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
|
||||
LLVMTypes::VoidType = llvm::Type::getVoidTy(*ctx);
|
||||
LLVMTypes::VoidPointerType = llvm::PointerType::get(llvm::Type::getInt8Ty(*ctx), 0);
|
||||
|
||||
LLVMTypes::BoolType = llvm::Type::getInt1Ty(*ctx);
|
||||
LLVMTypes::Int8Type = llvm::Type::getInt8Ty(*ctx);
|
||||
LLVMTypes::Int16Type = llvm::Type::getInt16Ty(*ctx);
|
||||
LLVMTypes::Int32Type = llvm::Type::getInt32Ty(*ctx);
|
||||
LLVMTypes::Int32PointerType = llvm::PointerType::get(LLVMTypes::Int32Type, 0);
|
||||
LLVMTypes::Int64Type = llvm::Type::getInt64Ty(*ctx);
|
||||
LLVMTypes::Int64PointerType = llvm::PointerType::get(LLVMTypes::Int64Type, 0);
|
||||
LLVMTypes::FloatType = llvm::Type::getFloatTy(*ctx);
|
||||
LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
|
||||
LLVMTypes::DoubleType = llvm::Type::getDoubleTy(*ctx);
|
||||
|
||||
LLVMTypes::Int8PointerType = llvm::PointerType::get(LLVMTypes::Int8Type, 0);
|
||||
LLVMTypes::Int16PointerType = llvm::PointerType::get(LLVMTypes::Int16Type, 0);
|
||||
LLVMTypes::Int32PointerType = llvm::PointerType::get(LLVMTypes::Int32Type, 0);
|
||||
LLVMTypes::Int64PointerType = llvm::PointerType::get(LLVMTypes::Int64Type, 0);
|
||||
LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
|
||||
LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0);
|
||||
|
||||
// Note that both the mask and bool vectors are vector of int32s
|
||||
// (not i1s). LLVM ends up generating much better SSE code with
|
||||
// this representation.
|
||||
@@ -92,17 +110,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
|
||||
|
||||
LLVMTypes::Int1VectorType =
|
||||
llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth);
|
||||
LLVMTypes::Int8VectorType =
|
||||
llvm::VectorType::get(LLVMTypes::Int8Type, target.vectorWidth);
|
||||
LLVMTypes::Int16VectorType =
|
||||
llvm::VectorType::get(LLVMTypes::Int16Type, target.vectorWidth);
|
||||
LLVMTypes::Int32VectorType =
|
||||
llvm::VectorType::get(LLVMTypes::Int32Type, target.vectorWidth);
|
||||
LLVMTypes::Int32VectorPointerType = llvm::PointerType::get(LLVMTypes::Int32VectorType, 0);
|
||||
LLVMTypes::Int64VectorType =
|
||||
llvm::VectorType::get(LLVMTypes::Int64Type, target.vectorWidth);
|
||||
LLVMTypes::Int64VectorPointerType = llvm::PointerType::get(LLVMTypes::Int64VectorType, 0);
|
||||
LLVMTypes::FloatVectorType =
|
||||
llvm::VectorType::get(LLVMTypes::FloatType, target.vectorWidth);
|
||||
LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
|
||||
LLVMTypes::DoubleVectorType =
|
||||
llvm::VectorType::get(LLVMTypes::DoubleType, target.vectorWidth);
|
||||
|
||||
LLVMTypes::Int8VectorPointerType = llvm::PointerType::get(LLVMTypes::Int8VectorType, 0);
|
||||
LLVMTypes::Int16VectorPointerType = llvm::PointerType::get(LLVMTypes::Int16VectorType, 0);
|
||||
LLVMTypes::Int32VectorPointerType = llvm::PointerType::get(LLVMTypes::Int32VectorType, 0);
|
||||
LLVMTypes::Int64VectorPointerType = llvm::PointerType::get(LLVMTypes::Int64VectorType, 0);
|
||||
LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
|
||||
LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);
|
||||
|
||||
LLVMTypes::VoidPointerVectorType =
|
||||
llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth);
|
||||
|
||||
@@ -129,7 +156,36 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
|
||||
}
|
||||
|
||||
|
||||
llvm::ConstantInt *LLVMInt32(int32_t ival) {
|
||||
llvm::ConstantInt *
|
||||
LLVMInt8(int8_t ival) {
|
||||
return llvm::ConstantInt::get(llvm::Type::getInt8Ty(*g->ctx), ival,
|
||||
true /*signed*/);
|
||||
}
|
||||
|
||||
|
||||
llvm::ConstantInt *
|
||||
LLVMUInt8(uint8_t ival) {
|
||||
return llvm::ConstantInt::get(llvm::Type::getInt8Ty(*g->ctx), ival,
|
||||
false /*unsigned*/);
|
||||
}
|
||||
|
||||
|
||||
llvm::ConstantInt *
|
||||
LLVMInt16(int16_t ival) {
|
||||
return llvm::ConstantInt::get(llvm::Type::getInt16Ty(*g->ctx), ival,
|
||||
true /*signed*/);
|
||||
}
|
||||
|
||||
|
||||
llvm::ConstantInt *
|
||||
LLVMUInt16(uint16_t ival) {
|
||||
return llvm::ConstantInt::get(llvm::Type::getInt16Ty(*g->ctx), ival,
|
||||
false /*unsigned*/);
|
||||
}
|
||||
|
||||
|
||||
llvm::ConstantInt *
|
||||
LLVMInt32(int32_t ival) {
|
||||
return llvm::ConstantInt::get(llvm::Type::getInt32Ty(*g->ctx), ival,
|
||||
true /*signed*/);
|
||||
}
|
||||
@@ -168,6 +224,82 @@ LLVMDouble(double dval) {
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMInt8Vector(int8_t ival) {
|
||||
llvm::Constant *v = LLVMInt8(ival);
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(v);
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMInt8Vector(const int8_t *ivec) {
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(LLVMInt8(ivec[i]));
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMUInt8Vector(uint8_t ival) {
|
||||
llvm::Constant *v = LLVMUInt8(ival);
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(v);
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMUInt8Vector(const uint8_t *ivec) {
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(LLVMUInt8(ivec[i]));
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMInt16Vector(int16_t ival) {
|
||||
llvm::Constant *v = LLVMInt16(ival);
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(v);
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMInt16Vector(const int16_t *ivec) {
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(LLVMInt16(ivec[i]));
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMUInt16Vector(uint16_t ival) {
|
||||
llvm::Constant *v = LLVMUInt16(ival);
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(v);
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMUInt16Vector(const uint16_t *ivec) {
|
||||
std::vector<llvm::Constant *> vals;
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
vals.push_back(LLVMUInt16(ivec[i]));
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
llvm::Constant *
|
||||
LLVMInt32Vector(int32_t ival) {
|
||||
llvm::Constant *v = LLVMInt32(ival);
|
||||
@@ -321,8 +453,8 @@ LLVMBoolVector(const bool *bvec) {
}


const llvm::ArrayType *
LLVMPointerVectorType(const llvm::Type *t) {
LLVM_TYPE_CONST llvm::ArrayType *
LLVMPointerVectorType(LLVM_TYPE_CONST llvm::Type *t) {
    // NOTE: ArrayType, not VectorType
    return llvm::ArrayType::get(llvm::PointerType::get(t, 0),
                                g->target.vectorWidth);
|
||||
|
||||
104
llvmutil.h
@@ -44,35 +44,49 @@
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#include <llvm/Constants.h>
|
||||
|
||||
|
||||
/** This structure holds pointers to a variety of LLVM types; code
|
||||
elsewhere can use them from here, ratherthan needing to make more
|
||||
verbose LLVM API calls.
|
||||
*/
|
||||
struct LLVMTypes {
|
||||
static const llvm::Type *VoidType;
|
||||
static const llvm::PointerType *VoidPointerType;
|
||||
static const llvm::Type *BoolType;
|
||||
static const llvm::Type *Int8Type;
|
||||
static const llvm::Type *Int16Type;
|
||||
static const llvm::Type *Int32Type;
|
||||
static const llvm::Type *Int32PointerType;
|
||||
static const llvm::Type *Int64Type;
|
||||
static const llvm::Type *Int64PointerType;
|
||||
static const llvm::Type *FloatType;
|
||||
static const llvm::Type *FloatPointerType;
|
||||
static const llvm::Type *DoubleType;
|
||||
static LLVM_TYPE_CONST llvm::Type *VoidType;
|
||||
static LLVM_TYPE_CONST llvm::PointerType *VoidPointerType;
|
||||
static LLVM_TYPE_CONST llvm::Type *BoolType;
|
||||
|
||||
static const llvm::VectorType *MaskType;
|
||||
static const llvm::VectorType *BoolVectorType;
|
||||
static const llvm::VectorType *Int1VectorType;
|
||||
static const llvm::VectorType *Int32VectorType;
|
||||
static const llvm::Type *Int32VectorPointerType;
|
||||
static const llvm::VectorType *Int64VectorType;
|
||||
static const llvm::Type *Int64VectorPointerType;
|
||||
static const llvm::VectorType *FloatVectorType;
|
||||
static const llvm::Type *FloatVectorPointerType;
|
||||
static const llvm::VectorType *DoubleVectorType;
|
||||
static const llvm::ArrayType *VoidPointerVectorType;
|
||||
static LLVM_TYPE_CONST llvm::Type *Int8Type;
|
||||
static LLVM_TYPE_CONST llvm::Type *Int16Type;
|
||||
static LLVM_TYPE_CONST llvm::Type *Int32Type;
|
||||
static LLVM_TYPE_CONST llvm::Type *Int64Type;
|
||||
static LLVM_TYPE_CONST llvm::Type *FloatType;
|
||||
static LLVM_TYPE_CONST llvm::Type *DoubleType;
|
||||
|
||||
static LLVM_TYPE_CONST llvm::Type *Int8PointerType;
|
||||
static LLVM_TYPE_CONST llvm::Type *Int16PointerType;
|
||||
static LLVM_TYPE_CONST llvm::Type *Int32PointerType;
|
||||
static LLVM_TYPE_CONST llvm::Type *Int64PointerType;
|
||||
static LLVM_TYPE_CONST llvm::Type *FloatPointerType;
|
||||
static LLVM_TYPE_CONST llvm::Type *DoublePointerType;
|
||||
|
||||
static LLVM_TYPE_CONST llvm::VectorType *MaskType;
|
||||
|
||||
static LLVM_TYPE_CONST llvm::VectorType *BoolVectorType;
|
||||
static LLVM_TYPE_CONST llvm::VectorType *Int1VectorType;
|
||||
static LLVM_TYPE_CONST llvm::VectorType *Int8VectorType;
|
||||
static LLVM_TYPE_CONST llvm::VectorType *Int16VectorType;
|
||||
static LLVM_TYPE_CONST llvm::VectorType *Int32VectorType;
|
||||
static LLVM_TYPE_CONST llvm::VectorType *Int64VectorType;
|
||||
static LLVM_TYPE_CONST llvm::VectorType *FloatVectorType;
|
||||
static LLVM_TYPE_CONST llvm::VectorType *DoubleVectorType;
|
||||
|
||||
static LLVM_TYPE_CONST llvm::Type *Int8VectorPointerType;
|
||||
static LLVM_TYPE_CONST llvm::Type *Int16VectorPointerType;
|
||||
static LLVM_TYPE_CONST llvm::Type *Int32VectorPointerType;
|
||||
static LLVM_TYPE_CONST llvm::Type *Int64VectorPointerType;
|
||||
static LLVM_TYPE_CONST llvm::Type *FloatVectorPointerType;
|
||||
static LLVM_TYPE_CONST llvm::Type *DoubleVectorPointerType;
|
||||
|
||||
static LLVM_TYPE_CONST llvm::ArrayType *VoidPointerVectorType;
|
||||
};
|
||||
|
||||
/** These variables hold the corresponding LLVM constant values as a
|
||||
@@ -86,6 +100,14 @@ extern llvm::Constant *LLVMTrue, *LLVMFalse;
|
||||
*/
|
||||
extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target);
|
||||
|
||||
/** Returns an LLVM i8 constant of the given value */
|
||||
extern llvm::ConstantInt *LLVMInt8(int8_t i);
|
||||
/** Returns an LLVM i8 constant of the given value */
|
||||
extern llvm::ConstantInt *LLVMUInt8(uint8_t i);
|
||||
/** Returns an LLVM i16 constant of the given value */
|
||||
extern llvm::ConstantInt *LLVMInt16(int16_t i);
|
||||
/** Returns an LLVM i16 constant of the given value */
|
||||
extern llvm::ConstantInt *LLVMUInt16(uint16_t i);
|
||||
/** Returns an LLVM i32 constant of the given value */
|
||||
extern llvm::ConstantInt *LLVMInt32(int32_t i);
|
||||
/** Returns an LLVM i32 constant of the given value */
|
||||
@@ -102,18 +124,35 @@ extern llvm::Constant *LLVMDouble(double f);
|
||||
/** Returns an LLVM boolean vector constant of the given value smeared
|
||||
across all elements */
|
||||
extern llvm::Constant *LLVMBoolVector(bool v);
|
||||
|
||||
/** Returns an LLVM i8 vector constant of the given value smeared
|
||||
across all elements */
|
||||
extern llvm::Constant *LLVMInt8Vector(int8_t i);
|
||||
/** Returns an LLVM i8 vector constant of the given value smeared
|
||||
across all elements */
|
||||
extern llvm::Constant *LLVMUInt8Vector(uint8_t i);
|
||||
|
||||
/** Returns an LLVM i16 vector constant of the given value smeared
|
||||
across all elements */
|
||||
extern llvm::Constant *LLVMInt16Vector(int16_t i);
|
||||
/** Returns an LLVM i16 vector constant of the given value smeared
|
||||
across all elements */
|
||||
extern llvm::Constant *LLVMUInt16Vector(uint16_t i);
|
||||
|
||||
/** Returns an LLVM i32 vector constant of the given value smeared
|
||||
across all elements */
|
||||
extern llvm::Constant *LLVMInt32Vector(int32_t i);
|
||||
/** Returns an LLVM i32 vector constant of the given value smeared
|
||||
across all elements */
|
||||
extern llvm::Constant *LLVMUInt32Vector(uint32_t i);
|
||||
|
||||
/** Returns an LLVM i64 vector constant of the given value smeared
|
||||
across all elements */
|
||||
extern llvm::Constant *LLVMInt64Vector(int64_t i);
|
||||
/** Returns an LLVM i64 vector constant of the given value smeared
|
||||
across all elements */
|
||||
extern llvm::Constant *LLVMUInt64Vector(uint64_t i);
|
||||
|
||||
/** Returns an LLVM float vector constant of the given value smeared
|
||||
across all elements */
|
||||
extern llvm::Constant *LLVMFloatVector(float f);
|
||||
@@ -124,18 +163,35 @@ extern llvm::Constant *LLVMDoubleVector(double f);
|
||||
/** Returns an LLVM boolean vector based on the given array of values.
|
||||
The array should have g->target.vectorWidth elements. */
|
||||
extern llvm::Constant *LLVMBoolVector(const bool *v);
|
||||
|
||||
/** Returns an LLVM i8 vector based on the given array of values.
|
||||
The array should have g->target.vectorWidth elements. */
|
||||
extern llvm::Constant *LLVMInt8Vector(const int8_t *i);
|
||||
/** Returns an LLVM i8 vector based on the given array of values.
|
||||
The array should have g->target.vectorWidth elements. */
|
||||
extern llvm::Constant *LLVMUInt8Vector(const uint8_t *i);
|
||||
|
||||
/** Returns an LLVM i16 vector based on the given array of values.
|
||||
The array should have g->target.vectorWidth elements. */
|
||||
extern llvm::Constant *LLVMInt16Vector(const int16_t *i);
|
||||
/** Returns an LLVM i16 vector based on the given array of values.
|
||||
The array should have g->target.vectorWidth elements. */
|
||||
extern llvm::Constant *LLVMUInt16Vector(const uint16_t *i);
|
||||
|
||||
/** Returns an LLVM i32 vector based on the given array of values.
|
||||
The array should have g->target.vectorWidth elements. */
|
||||
extern llvm::Constant *LLVMInt32Vector(const int32_t *i);
|
||||
/** Returns an LLVM i32 vector based on the given array of values.
|
||||
The array should have g->target.vectorWidth elements. */
|
||||
extern llvm::Constant *LLVMUInt32Vector(const uint32_t *i);
|
||||
|
||||
/** Returns an LLVM i64 vector based on the given array of values.
|
||||
The array should have g->target.vectorWidth elements. */
|
||||
extern llvm::Constant *LLVMInt64Vector(const int64_t *i);
|
||||
/** Returns an LLVM i64 vector based on the given array of values.
|
||||
The array should have g->target.vectorWidth elements. */
|
||||
extern llvm::Constant *LLVMUInt64Vector(const uint64_t *i);
|
||||
|
||||
/** Returns an LLVM float vector based on the given array of values.
|
||||
The array should have g->target.vectorWidth elements. */
|
||||
extern llvm::Constant *LLVMFloatVector(const float *f);
|
||||
@@ -152,6 +208,6 @@ extern llvm::Constant *LLVMMaskAllOff;
|
||||
pointers to that type. (In practice, an array of pointers, since LLVM
|
||||
prohibits vectors of pointers.
|
||||
*/
|
||||
extern const llvm::ArrayType *LLVMPointerVectorType(const llvm::Type *t);
|
||||
extern LLVM_TYPE_CONST llvm::ArrayType *LLVMPointerVectorType(LLVM_TYPE_CONST llvm::Type *t);
|
||||
|
||||
#endif // ISPC_LLVMUTIL_H
|
||||
|
||||
16
main.cpp
@@ -91,7 +91,11 @@ static void usage(int ret) {
|
||||
printf(" disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
|
||||
printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
|
||||
printf(" disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
|
||||
printf(" [--target={sse2,sse4,sse4x2,avx}] Select target ISA (SSE4 is default)\n");
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
printf(" [--target={sse2,sse4,sse4x2,avx}] Select target ISA (SSE4 is default unless compiling for atom; then SSE2 is.)\n");
|
||||
#else
|
||||
printf(" [--target={sse2,sse4,sse4x2}] Select target ISA (SSE4 is default unless compiling for atom; then SSE2 is.)\n");
|
||||
#endif // LLVM 3.0
|
||||
printf(" [--version]\t\t\t\tPrint ispc version\n");
|
||||
printf(" [--woff]\t\t\t\tDisable warnings\n");
|
||||
printf(" [--wno-perf]\t\t\tDon't issue warnings related to performance-related issues\n");
|
||||
@@ -118,11 +122,13 @@ static void lDoTarget(const char *target) {
|
||||
g->target.nativeVectorWidth = 4;
|
||||
g->target.vectorWidth = 8;
|
||||
}
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
else if (!strcasecmp(target, "avx")) {
|
||||
g->target.isa = Target::AVX;
|
||||
g->target.nativeVectorWidth = 8;
|
||||
g->target.vectorWidth = 8;
|
||||
}
|
||||
#endif // LLVM 3.0
|
||||
else
|
||||
usage(1);
|
||||
}
|
||||
@@ -192,7 +198,7 @@ int main(int Argc, char *Argv[]) {
|
||||
// as we're parsing below
|
||||
g = new Globals;
|
||||
|
||||
bool debugSet = false, optSet = false;
|
||||
bool debugSet = false, optSet = false, targetSet = false;
|
||||
Module::OutputType ot = Module::Object;
|
||||
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
@@ -226,6 +232,7 @@ int main(int Argc, char *Argv[]) {
|
||||
else if (!strcmp(argv[i], "--target")) {
|
||||
if (++i == argc) usage(1);
|
||||
lDoTarget(argv[i]);
|
||||
targetSet = true;
|
||||
}
|
||||
else if (!strncmp(argv[i], "--target=", 9)) {
|
||||
const char *target = argv[i] + 9;
|
||||
@@ -315,6 +322,11 @@ int main(int Argc, char *Argv[]) {

    if (debugSet && !optSet)
        g->opt.level = 0;

    // Make SSE2 the default target on atom unless the target has been set
    // explicitly.
    if (!targetSet && (g->target.cpu == "atom"))
        lDoTarget("sse2");

    m = new Module(file);
    if (m->CompileFile() == 0) {
        if (outFileName != NULL)
|
||||
|
||||
472
module.cpp
@@ -70,6 +70,7 @@
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Intrinsics.h>
|
||||
#include <llvm/Support/FormattedStream.h>
|
||||
#include <llvm/Support/FileUtilities.h>
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/Target/TargetRegistry.h>
|
||||
#include <llvm/Target/TargetSelect.h>
|
||||
@@ -79,6 +80,9 @@
|
||||
#include <llvm/PassManager.h>
|
||||
#include <llvm/Analysis/Verifier.h>
|
||||
#include <llvm/Support/CFG.h>
|
||||
#include <clang/Frontend/CompilerInstance.h>
|
||||
#include <clang/Frontend/Utils.h>
|
||||
#include <clang/Basic/TargetInfo.h>
|
||||
#ifndef LLVM_2_8
|
||||
#include <llvm/Support/ToolOutputFile.h>
|
||||
#include <llvm/Support/Host.h>
|
||||
@@ -114,16 +118,27 @@ Module::Module(const char *fn) {
|
||||
// If we're generating debugging symbols, let the DIBuilder know that
|
||||
// we're starting a new compilation unit.
|
||||
if (diBuilder != NULL) {
|
||||
std::string directory, name;
|
||||
GetDirectoryAndFileName(g->currentDirectory, filename, &directory,
|
||||
&name);
|
||||
diBuilder->createCompileUnit(llvm::dwarf::DW_LANG_C99, /* lang */
|
||||
name, /* filename */
|
||||
directory, /* directory */
|
||||
"ispc", /* producer */
|
||||
g->opt.level > 0 /* is optimized */,
|
||||
"-g", /* command line args */
|
||||
0 /* run time version */);
|
||||
if (filename == NULL) {
|
||||
// Unfortunately we can't yet call Error() since the global 'm'
|
||||
// variable hasn't been initialized yet.
|
||||
fprintf(stderr, "Can't emit debugging information with no "
|
||||
"source file on disk.\n");
|
||||
++errorCount;
|
||||
delete diBuilder;
|
||||
diBuilder = NULL;
|
||||
}
|
||||
else {
|
||||
std::string directory, name;
|
||||
GetDirectoryAndFileName(g->currentDirectory, filename, &directory,
|
||||
&name);
|
||||
diBuilder->createCompileUnit(llvm::dwarf::DW_LANG_C99, /* lang */
|
||||
name, /* filename */
|
||||
directory, /* directory */
|
||||
"ispc", /* producer */
|
||||
g->opt.level > 0 /* is optimized */,
|
||||
"-g", /* command line args */
|
||||
0 /* run time version */);
|
||||
}
|
||||
}
|
||||
#endif // LLVM_2_8
|
||||
}
|
||||
@@ -133,8 +148,9 @@ extern FILE *yyin;
|
||||
extern int yyparse();
|
||||
typedef struct yy_buffer_state *YY_BUFFER_STATE;
|
||||
extern void yy_switch_to_buffer(YY_BUFFER_STATE);
|
||||
extern YY_BUFFER_STATE yy_scan_string(const char *);
|
||||
extern YY_BUFFER_STATE yy_create_buffer(FILE *, int);
|
||||
|
||||
extern void yy_delete_buffer(YY_BUFFER_STATE);
|
||||
|
||||
int
|
||||
Module::CompileFile() {
|
||||
@@ -146,63 +162,28 @@ Module::CompileFile() {
|
||||
|
||||
bool runPreprocessor = g->runCPP;
|
||||
|
||||
// We currently require that the user run the preprocessor by hand on
|
||||
// windows and pipe the result to ispc.
|
||||
// FIXME: It'd be nice to run cl.exe for them to do this, if it's available
|
||||
// in the PATH...
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
runPreprocessor = false;
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
|
||||
// The FILE handle that we'll point the parser at. This may end up
|
||||
// being stdin, an opened file on disk, or the piped output from the
|
||||
// preprocessor.
|
||||
FILE *f;
|
||||
|
||||
if (runPreprocessor) {
|
||||
// Before we run the preprocessor, make sure that file exists and
|
||||
// we can read it since otherwise we get a pretty obscure/unhelpful
|
||||
// error message from cpp
|
||||
if (filename) {
|
||||
f = fopen(filename, "r");
|
||||
if (f == NULL) {
|
||||
if (filename != NULL) {
|
||||
// Try to open the file first, since otherwise we crash in the
|
||||
// preprocessor if the file doesn't exist.
|
||||
FILE *f = fopen(filename, "r");
|
||||
if (!f) {
|
||||
perror(filename);
|
||||
return 1;
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
// Go ahead and construct a command string to run the preprocessor.
|
||||
// First, concatentate all of the -D statements from the original
|
||||
// ispc command line so that we can pass them along to cpp.
|
||||
std::string cppDefs;
|
||||
for (unsigned int i = 0; i < g->cppArgs.size(); ++i) {
|
||||
cppDefs += g->cppArgs[i];
|
||||
cppDefs += ' ';
|
||||
}
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
// For now, this code should never be reached
|
||||
FATAL("Need to implement code to run the preprocessor for windows");
|
||||
#else // ISPC_IS_WINDOWS
|
||||
char *cmd = NULL;
|
||||
if (asprintf(&cmd, "/usr/bin/cpp -DISPC=1 -DPI=3.1415926536 %s %s",
|
||||
cppDefs.c_str(), filename ? filename : "-") == -1) {
|
||||
fprintf(stderr, "Unable to allocate memory in asprintf()?!\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
f = popen(cmd, "r");
|
||||
free(cmd);
|
||||
|
||||
if (f == NULL) {
|
||||
perror(filename ? filename : "<stdin>");
|
||||
return 1;
|
||||
}
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
std::string buffer;
|
||||
llvm::raw_string_ostream os(buffer);
|
||||
execPreprocessor((filename != NULL) ? filename : "-", &os);
|
||||
YY_BUFFER_STATE strbuf = yy_scan_string(os.str().c_str());
|
||||
yyparse();
|
||||
yy_delete_buffer(strbuf);
|
||||
}
|
||||
else {
|
||||
// No preprocessor, just open up the file if it's not stdin..
|
||||
FILE* f = NULL;
|
||||
if (filename == NULL)
|
||||
f = stdin;
|
||||
else {
|
||||
@@ -212,24 +193,11 @@ Module::CompileFile() {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Here is where the magic happens: parse the file, build the AST, etc.
|
||||
// This in turn will lead to calls back to Module::AddFunction(),
|
||||
// etc...
|
||||
yyin = f;
|
||||
yy_switch_to_buffer(yy_create_buffer(yyin, 4096));
|
||||
yyparse();
|
||||
|
||||
if (runPreprocessor) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
FATAL("need to implement this for windows as well");
|
||||
#else
|
||||
pclose(f);
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
}
|
||||
else
|
||||
yyin = f;
|
||||
yy_switch_to_buffer(yy_create_buffer(yyin, 4096));
|
||||
yyparse();
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
if (errorCount == 0)
|
||||
Optimize(module, g->opt.level);
|
||||
@@ -248,8 +216,8 @@ lRecursiveCheckVarying(const Type *t) {
|
||||
|
||||
const StructType *st = dynamic_cast<const StructType *>(t);
|
||||
if (st) {
|
||||
for (int i = 0; i < st->NumElements(); ++i)
|
||||
if (lRecursiveCheckVarying(st->GetMemberType(i)))
|
||||
for (int i = 0; i < st->GetElementCount(); ++i)
|
||||
if (lRecursiveCheckVarying(st->GetElementType(i)))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
@@ -360,7 +328,7 @@ lInitFunSymDecl(DeclSpecs *ds, Declarator *decl) {
|
||||
|
||||
// Get the LLVM FunctionType
|
||||
bool includeMask = (ds->storageClass != SC_EXTERN_C);
|
||||
const llvm::FunctionType *llvmFunctionType =
|
||||
LLVM_TYPE_CONST llvm::FunctionType *llvmFunctionType =
|
||||
functionType->LLVMFunctionType(g->ctx, includeMask);
|
||||
if (llvmFunctionType == NULL)
|
||||
return false;
|
||||
@@ -529,7 +497,7 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
|
||||
return;
|
||||
}
|
||||
|
||||
const llvm::Type *llvmType = decl->sym->type->LLVMType(g->ctx);
|
||||
LLVM_TYPE_CONST llvm::Type *llvmType = decl->sym->type->LLVMType(g->ctx);
|
||||
llvm::GlobalValue::LinkageTypes linkage =
|
||||
(ds->storageClass == SC_STATIC) ? llvm::GlobalValue::InternalLinkage :
|
||||
llvm::GlobalValue::ExternalLinkage;
|
||||
@@ -548,8 +516,12 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
|
||||
decl->initExpr = decl->initExpr->TypeCheck();
|
||||
if (decl->initExpr != NULL) {
|
||||
// We need to make sure the initializer expression is
|
||||
// the same type as the global
|
||||
decl->initExpr = decl->initExpr->TypeConv(decl->sym->type, "initializer");
|
||||
// the same type as the global. (But not if it's an
|
||||
// ExprList; they don't have types per se / can't type
|
||||
// convert themselves anyway.)
|
||||
if (dynamic_cast<ExprList *>(decl->initExpr) == NULL)
|
||||
decl->initExpr =
|
||||
decl->initExpr->TypeConv(decl->sym->type, "initializer");
|
||||
|
||||
if (decl->initExpr != NULL) {
|
||||
decl->initExpr = decl->initExpr->Optimize();
|
||||
@@ -619,7 +591,7 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, Declarator *decl,
|
||||
llvm::dyn_cast<const llvm::StructType>(pt->getElementType());
|
||||
|
||||
// Get the type of the argument we're copying in and its Symbol pointer
|
||||
const llvm::Type *argType = argStructType->getElementType(i);
|
||||
LLVM_TYPE_CONST llvm::Type *argType = argStructType->getElementType(i);
|
||||
Declaration *pdecl = (*decl->functionArgs)[i];
|
||||
assert(pdecl->declarators.size() == 1);
|
||||
Symbol *sym = pdecl->declarators[0]->sym;
|
||||
@@ -683,6 +655,14 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
assert(threadCountSym);
|
||||
threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount");
|
||||
ctx->StoreInst(threadCount, threadCountSym->storagePtr);
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
// On Windows, we dynamically-allocate space for the task arguments
|
||||
// (see FunctionEmitContext::LaunchInst().) Here is where we emit
|
||||
// the code to free that memory, now that we've copied the
|
||||
// parameter values out of the structure.
|
||||
ctx->EmitFree(structParamPtr);
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
}
|
||||
else {
|
||||
// Regular, non-task function
|
||||
@@ -818,20 +798,22 @@ Module::AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code) {
|
||||
}
|
||||
|
||||
if (errorCount == 0) {
|
||||
if (g->debugPrint) {
|
||||
llvm::PassManager ppm;
|
||||
ppm.add(llvm::createPrintModulePass(&llvm::outs()));
|
||||
ppm.run(*module);
|
||||
if (llvm::verifyFunction(*function, llvm::ReturnStatusAction) == true) {
|
||||
if (g->debugPrint) {
|
||||
llvm::PassManager ppm;
|
||||
ppm.add(llvm::createPrintModulePass(&llvm::outs()));
|
||||
ppm.run(*module);
|
||||
}
|
||||
FATAL("Function verificication failed");
|
||||
}
|
||||
|
||||
llvm::verifyFunction(*function);
|
||||
|
||||
// If the function is 'export'-qualified, emit a second version of
|
||||
// it without a mask parameter and without name mangling so that
|
||||
// the application can call it
|
||||
if (ds->storageClass == SC_EXPORT) {
|
||||
if (!functionType->isTask) {
|
||||
const llvm::FunctionType *ftype = functionType->LLVMFunctionType(g->ctx);
|
||||
LLVM_TYPE_CONST llvm::FunctionType *ftype =
|
||||
functionType->LLVMFunctionType(g->ctx);
|
||||
llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
|
||||
llvm::Function *appFunction =
|
||||
llvm::Function::Create(ftype, linkage, funSym->name.c_str(), module);
|
||||
@@ -847,8 +829,17 @@ Module::AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code) {
|
||||
FunctionEmitContext ec(functionType->GetReturnType(), appFunction, funSym,
|
||||
firstStmtPos);
|
||||
lEmitFunctionCode(&ec, appFunction, functionType, funSym, decl, code);
|
||||
if (errorCount == 0)
|
||||
llvm::verifyFunction(*appFunction);
|
||||
if (errorCount == 0) {
|
||||
if (llvm::verifyFunction(*appFunction,
|
||||
llvm::ReturnStatusAction) == true) {
|
||||
if (g->debugPrint) {
|
||||
llvm::PassManager ppm;
|
||||
ppm.add(llvm::createPrintModulePass(&llvm::outs()));
|
||||
ppm.run(*module);
|
||||
}
|
||||
FATAL("Function verificication failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -932,6 +923,9 @@ Module::WriteOutput(OutputType outputType, const char *outFileName) {
|
||||
bool
|
||||
Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName) {
|
||||
llvm::InitializeAllTargets();
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::InitializeAllTargetMCs();
|
||||
#endif
|
||||
llvm::InitializeAllAsmPrinters();
|
||||
llvm::InitializeAllAsmParsers();
|
||||
|
||||
@@ -979,14 +973,22 @@ Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName
|
||||
}
|
||||
|
||||
std::string featuresString;
|
||||
llvm::TargetMachine *targetMachine = NULL;
|
||||
#if defined LLVM_3_0svn || defined LLVM_3_0
|
||||
if (g->target.isa == Target::AVX)
|
||||
featuresString = "+avx";
|
||||
targetMachine = target->createTargetMachine(triple.getTriple(), g->target.cpu,
|
||||
featuresString);
|
||||
#else
|
||||
if (g->target.cpu.size()) {
|
||||
llvm::SubtargetFeatures features;
|
||||
features.setCPU(g->target.cpu);
|
||||
featuresString = features.getString();
|
||||
}
|
||||
|
||||
llvm::TargetMachine *targetMachine =
|
||||
target->createTargetMachine(triple.getTriple(), featuresString);
|
||||
targetMachine = target->createTargetMachine(triple.getTriple(),
|
||||
featuresString);
|
||||
#endif
|
||||
if (targetMachine == NULL) {
|
||||
fprintf(stderr, "Unable to create target machine for target \"%s\"!",
|
||||
triple.str().c_str());
|
||||
@@ -1034,26 +1036,6 @@ Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName
|
||||
}
|
||||
|
||||
|
||||
/** Walk through the elements of the given structure; for any elements that
|
||||
are themselves structs, add their Type * to structParamTypes and
|
||||
recursively process their elements.
|
||||
*/
|
||||
static void
|
||||
lRecursiveAddStructs(const StructType *structType,
|
||||
std::vector<const StructType *> &structParamTypes) {
|
||||
for (int i = 0; i < structType->NumElements(); ++i) {
|
||||
const Type *elementBaseType = structType->GetMemberType(i)->GetBaseType();
|
||||
const StructType *elementStructType =
|
||||
dynamic_cast<const StructType *>(elementBaseType);
|
||||
if (elementStructType != NULL) {
|
||||
structParamTypes.push_back(elementStructType);
|
||||
lRecursiveAddStructs(elementStructType, structParamTypes);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/** Small structure used in representing dependency graphs of structures
|
||||
(i.e. given a StructType, which other structure types does it have as
|
||||
elements).
|
||||
@@ -1112,9 +1094,9 @@ lEmitStructDecls(std::vector<const StructType *> &structTypes, FILE *file) {
|
||||
StructDAGNode *node = new StructDAGNode;
|
||||
structToNode[st] = node;
|
||||
|
||||
for (int j = 0; j < st->NumElements(); ++j) {
|
||||
for (int j = 0; j < st->GetElementCount(); ++j) {
|
||||
const StructType *elementStructType =
|
||||
dynamic_cast<const StructType *>(st->GetMemberType(j));
|
||||
dynamic_cast<const StructType *>(st->GetElementType(j));
|
||||
// If this element is a struct type and we haven't already
|
||||
// processed it for the current struct type, then upate th
|
||||
// dependencies and record that this element type has other
|
||||
@@ -1144,8 +1126,8 @@ lEmitStructDecls(std::vector<const StructType *> &structTypes, FILE *file) {
|
||||
for (unsigned int i = 0; i < sortedTypes.size(); ++i) {
|
||||
const StructType *st = sortedTypes[i];
|
||||
fprintf(file, "struct %s {\n", st->GetStructName().c_str());
|
||||
for (int j = 0; j < st->NumElements(); ++j) {
|
||||
const Type *type = st->GetMemberType(j)->GetAsNonConstType();
|
||||
for (int j = 0; j < st->GetElementCount(); ++j) {
|
||||
const Type *type = st->GetElementType(j)->GetAsNonConstType();
|
||||
std::string d = type->GetCDeclaration(st->GetElementName(j));
|
||||
fprintf(file, " %s;\n", d.c_str());
|
||||
}
|
||||
@@ -1154,6 +1136,42 @@ lEmitStructDecls(std::vector<const StructType *> &structTypes, FILE *file) {
|
||||
}
|
||||
|
||||
|
||||
/** Emit C declarations of enumerator types to the generated header file.
|
||||
*/
|
||||
static void
|
||||
lEmitEnumDecls(const std::vector<const EnumType *> &enumTypes, FILE *file) {
|
||||
if (enumTypes.size() == 0)
|
||||
return;
|
||||
|
||||
fprintf(file, "///////////////////////////////////////////////////////////////////////////\n");
|
||||
fprintf(file, "// Enumerator types with external visibility from ispc code\n");
|
||||
fprintf(file, "///////////////////////////////////////////////////////////////////////////\n\n");
|
||||
|
||||
for (unsigned int i = 0; i < enumTypes.size(); ++i) {
|
||||
std::string declaration = enumTypes[i]->GetCDeclaration("");
|
||||
fprintf(file, "%s {\n", declaration.c_str());
|
||||
|
||||
// Print the individual enumerators
|
||||
for (int j = 0; j < enumTypes[i]->GetEnumeratorCount(); ++j) {
|
||||
const Symbol *e = enumTypes[i]->GetEnumerator(j);
|
||||
assert(e->constValue != NULL);
|
||||
unsigned int enumValue;
|
||||
int count = e->constValue->AsUInt32(&enumValue);
|
||||
assert(count == 1);
|
||||
|
||||
// Always print an initializer to set the value. We could be
|
||||
// 'clever' here and detect whether the implicit value given by
|
||||
// one plus the previous enumerator value (or zero, for the
|
||||
// first enumerator) is the same as the value stored with the
|
||||
// enumerator, though that doesn't seem worth the trouble...
|
||||
fprintf(file, " %s = %d%c\n", e->name.c_str(), enumValue,
|
||||
(j < enumTypes[i]->GetEnumeratorCount() - 1) ? ',' : ' ');
|
||||
}
|
||||
fprintf(file, "};\n");
|
||||
}
|
||||
}
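
For a hypothetical enum written in ispc as enum Foo { A, B = 10 } and exported through the header, the loop above would emit roughly the following; note that every enumerator gets an explicit initializer, as the fprintf call always prints one:

    // Sketch of the generated header output for a hypothetical exported enum.
    enum Foo {
        A = 0,
        B = 10
    };
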
|
||||
|
||||
|
||||
/** Print declarations of VectorTypes used in 'export'ed parts of the
|
||||
program in the header file.
|
||||
*/
|
||||
@@ -1166,7 +1184,6 @@ lEmitVectorTypedefs(const std::vector<const VectorType *> &types, FILE *file) {
|
||||
fprintf(file, "// Vector types with external visibility from ispc code\n");
|
||||
fprintf(file, "///////////////////////////////////////////////////////////////////////////\n\n");
|
||||
|
||||
std::vector<const VectorType *> emittedTypes;
|
||||
int align = g->target.nativeVectorWidth * 4;
|
||||
|
||||
for (unsigned int i = 0; i < types.size(); ++i) {
|
||||
@@ -1174,17 +1191,6 @@ lEmitVectorTypedefs(const std::vector<const VectorType *> &types, FILE *file) {
|
||||
const VectorType *vt = types[i]->GetAsNonConstType();
|
||||
int size = vt->GetElementCount();
|
||||
|
||||
// Don't print the declaration for this type if we've already
|
||||
// handled it.
|
||||
//
|
||||
// FIXME: this is n^2, unnecessarily. Being able to compare Type
|
||||
// *s directly will eventually make this much better--can use a
|
||||
// std::set... Probably not going to matter in practice.
|
||||
for (unsigned int j = 0; j < emittedTypes.size(); ++j) {
|
||||
if (Type::Equal(vt, emittedTypes[j]))
|
||||
goto skip;
|
||||
}
|
||||
|
||||
baseDecl = vt->GetBaseType()->GetCDeclaration("");
|
||||
fprintf(file, "#ifdef _MSC_VER\n__declspec( align(%d) ) ", align);
|
||||
fprintf(file, "struct %s%d { %s v[%d]; };\n", baseDecl.c_str(), size,
|
||||
@@ -1193,58 +1199,59 @@ lEmitVectorTypedefs(const std::vector<const VectorType *> &types, FILE *file) {
|
||||
fprintf(file, "struct %s%d { %s v[%d]; } __attribute__ ((aligned(%d)));\n",
|
||||
baseDecl.c_str(), size, baseDecl.c_str(), size, align);
|
||||
fprintf(file, "#endif\n");
|
||||
|
||||
emittedTypes.push_back(vt);
|
||||
skip:
|
||||
;
|
||||
}
|
||||
fprintf(file, "\n");
|
||||
}
|
||||
|
||||
|
||||
/** Given a set of StructTypes, walk through their elements and collect the
|
||||
VectorTypes that are present in them.
|
||||
/** Add the given type to the vector, if that type isn't already in there.
|
||||
*/
|
||||
static void
|
||||
lGetVectorsFromStructs(const std::vector<const StructType *> &structParamTypes,
|
||||
std::vector<const VectorType *> *vectorParamTypes) {
|
||||
for (unsigned int i = 0; i < structParamTypes.size(); ++i) {
|
||||
const StructType *structType = structParamTypes[i];
|
||||
for (int j = 0; j < structType->NumElements(); ++j) {
|
||||
const Type *elementType = structType->GetMemberType(j);
|
||||
template <typename T> static void
|
||||
lAddTypeIfNew(const Type *type, std::vector<const T *> *exportedTypes) {
|
||||
type = type->GetAsNonConstType();
|
||||
|
||||
const ArrayType *at = dynamic_cast<const ArrayType *>(elementType);
|
||||
if (at)
|
||||
elementType = at->GetBaseType();
|
||||
// Linear search, so this ends up being n^2. It's unlikely this will
|
||||
// matter in practice, though.
|
||||
for (unsigned int i = 0; i < exportedTypes->size(); ++i)
|
||||
if (Type::Equal((*exportedTypes)[i], type))
|
||||
return;
|
||||
|
||||
const VectorType *vt = dynamic_cast<const VectorType *>(elementType);
|
||||
if (vt != NULL) {
|
||||
// make sure it isn't there already...
|
||||
for (unsigned int k = 0; k < vectorParamTypes->size(); ++k)
|
||||
if (Type::Equal(vt, (*vectorParamTypes)[k]))
|
||||
goto skip;
|
||||
vectorParamTypes->push_back(vt);
|
||||
}
|
||||
skip:
|
||||
;
|
||||
}
|
||||
}
|
||||
const T *castType = dynamic_cast<const T *>(type);
|
||||
assert(castType != NULL);
|
||||
exportedTypes->push_back(castType);
|
||||
}

/** Given an arbitrary type that appears in the app/ispc interface, add it
to an appropriate vector if it is a struct, enum, or short vector type.
Then, if it's a struct, recursively process its members to do the same.
*/
static void
lGetStructAndVectorTypes(const Type *type,
std::vector<const StructType *> *structParamTypes,
std::vector<const VectorType *> *vectorParamTypes) {
const StructType *st = dynamic_cast<const StructType *>(type->GetBaseType());
if (st != NULL)
structParamTypes->push_back(st);
const VectorType *vt = dynamic_cast<const VectorType *>(type);
if (vt != NULL)
vectorParamTypes->push_back(vt);
vt = dynamic_cast<const VectorType *>(type->GetBaseType());
if (vt != NULL)
vectorParamTypes->push_back(vt);
lGetExportedTypes(const Type *type,
std::vector<const StructType *> *exportedStructTypes,
std::vector<const EnumType *> *exportedEnumTypes,
std::vector<const VectorType *> *exportedVectorTypes) {
const ArrayType *arrayType = dynamic_cast<const ArrayType *>(type);
const StructType *structType = dynamic_cast<const StructType *>(type);

if (dynamic_cast<const ReferenceType *>(type) != NULL)
lGetExportedTypes(type->GetReferenceTarget(), exportedStructTypes,
exportedEnumTypes, exportedVectorTypes);
else if (arrayType != NULL)
lGetExportedTypes(arrayType->GetElementType(), exportedStructTypes,
exportedEnumTypes, exportedVectorTypes);
else if (structType != NULL) {
lAddTypeIfNew(type, exportedStructTypes);
for (int i = 0; i < structType->GetElementCount(); ++i)
lGetExportedTypes(structType->GetElementType(i), exportedStructTypes,
exportedEnumTypes, exportedVectorTypes);
}
else if (dynamic_cast<const EnumType *>(type) != NULL)
lAddTypeIfNew(type, exportedEnumTypes);
else if (dynamic_cast<const VectorType *>(type) != NULL)
lAddTypeIfNew(type, exportedVectorTypes);
else
assert(dynamic_cast<const AtomicType *>(type) != NULL);
}
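To make the recursion concrete, a worked example with illustrative declarations (not taken from the diff):

    // Hypothetical exported interface:
    //   enum Mode { MODE_A, MODE_B };
    //   struct Inner { float<4> v; };
    //   struct Outer { Inner items[4]; Mode mode; };
    //   export void f(uniform Outer o[]);
    //
    // Walking f's parameter type with lGetExportedTypes would roughly yield:
    //   exportedStructTypes  = { Outer, Inner }
    //   exportedEnumTypes    = { Mode }
    //   exportedVectorTypes  = { float<4> }

Arrays and references are peeled off and their element or target types processed, structs are recorded and then recursed into member by member, and enums and short vectors are recorded directly.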
|
||||
|
||||
|
||||
@@ -1252,18 +1259,21 @@ lGetStructAndVectorTypes(const Type *type,
|
||||
present in the parameters to them.
|
||||
*/
|
||||
static void
|
||||
lGetStructAndVectorParams(const std::vector<Symbol *> &funcs,
|
||||
std::vector<const StructType *> *structParamTypes,
|
||||
std::vector<const VectorType *> *vectorParamTypes) {
|
||||
lGetExportedParamTypes(const std::vector<Symbol *> &funcs,
|
||||
std::vector<const StructType *> *exportedStructTypes,
|
||||
std::vector<const EnumType *> *exportedEnumTypes,
|
||||
std::vector<const VectorType *> *exportedVectorTypes) {
|
||||
for (unsigned int i = 0; i < funcs.size(); ++i) {
|
||||
const FunctionType *ftype = dynamic_cast<const FunctionType *>(funcs[i]->type);
|
||||
lGetStructAndVectorTypes(ftype->GetReturnType(), structParamTypes,
|
||||
vectorParamTypes);
|
||||
// Handle the return type
|
||||
lGetExportedTypes(ftype->GetReturnType(), exportedStructTypes,
|
||||
exportedEnumTypes, exportedVectorTypes);
|
||||
|
||||
// And now the parameter types...
|
||||
const std::vector<const Type *> &argTypes = ftype->GetArgumentTypes();
|
||||
for (unsigned int j = 0; j < argTypes.size(); ++j) {
|
||||
lGetStructAndVectorTypes(argTypes[j], structParamTypes,
|
||||
vectorParamTypes);
|
||||
}
|
||||
for (unsigned int j = 0; j < argTypes.size(); ++j)
|
||||
lGetExportedTypes(argTypes[j], exportedStructTypes,
|
||||
exportedEnumTypes, exportedVectorTypes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1337,6 +1347,21 @@ Module::writeHeader(const char *fn) {
fprintf(f, "#ifndef %s\n#define %s\n\n", guard.c_str(), guard.c_str());

fprintf(f, "#include <stdint.h>\n\n");

switch (g->target.isa) {
case Target::SSE2:
fprintf(f, "#define ISPC_TARGET_SSE2\n\n");
break;
case Target::SSE4:
fprintf(f, "#define ISPC_TARGET_SSE4\n\n");
break;
case Target::AVX:
fprintf(f, "#define ISPC_TARGET_AVX\n\n");
break;
default:
FATAL("Unhandled target in header emission");
}

fprintf(f, "#ifdef __cplusplus\nnamespace ispc {\n#endif // __cplusplus\n\n");
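Taken together, the preamble emitted at the top of the generated header for, say, the SSE4 target looks roughly like the following (the guard name is derived from the output filename; this one is illustrative):

    #ifndef ISPC_SIMPLE_ISPC_H
    #define ISPC_SIMPLE_ISPC_H

    #include <stdint.h>

    #define ISPC_TARGET_SSE4

    #ifdef __cplusplus
    namespace ispc {
    #endif // __cplusplus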
if (g->emitInstrumentation) {
|
||||
@@ -1352,42 +1377,25 @@ Module::writeHeader(const char *fn) {
|
||||
m->symbolTable->GetMatchingFunctions(lIsExported, &exportedFuncs);
|
||||
m->symbolTable->GetMatchingFunctions(lIsExternC, &externCFuncs);
|
||||
|
||||
// Get all of the structs used as function parameters and extern
|
||||
// globals. These vectors may have repeats.
|
||||
std::vector<const StructType *> structParamTypes;
|
||||
std::vector<const VectorType *> vectorParamTypes;
|
||||
lGetStructAndVectorParams(exportedFuncs, &structParamTypes, &vectorParamTypes);
|
||||
lGetStructAndVectorParams(externCFuncs, &structParamTypes, &vectorParamTypes);
|
||||
// Get all of the struct, vector, and enumerant types used as function
|
||||
// parameters. These vectors may have repeats.
|
||||
std::vector<const StructType *> exportedStructTypes;
|
||||
std::vector<const EnumType *> exportedEnumTypes;
|
||||
std::vector<const VectorType *> exportedVectorTypes;
|
||||
lGetExportedParamTypes(exportedFuncs, &exportedStructTypes,
|
||||
&exportedEnumTypes, &exportedVectorTypes);
|
||||
lGetExportedParamTypes(externCFuncs, &exportedStructTypes,
|
||||
&exportedEnumTypes, &exportedVectorTypes);
|
||||
|
||||
// And do same for the 'extern' globals
|
||||
// And do the same for the 'extern' globals
|
||||
for (unsigned int i = 0; i < externGlobals.size(); ++i)
|
||||
lGetStructAndVectorTypes(externGlobals[i]->type,
|
||||
&structParamTypes, &vectorParamTypes);
|
||||
lGetExportedTypes(externGlobals[i]->type, &exportedStructTypes,
|
||||
&exportedEnumTypes, &exportedVectorTypes);
|
||||
|
||||
// Get all of the structs that the structs we have seen so far
// depend on transitively. Note the array may grow as a result of the
// call to lRecursiveAddStructs -> an iterator would be a bad idea
// (would be invalidated) -> the value of size() may increase as we go
// along. But that's good; that lets us actually get the whole
// transitive set of struct types we need.
|
||||
for (unsigned int i = 0; i < structParamTypes.size(); ++i)
|
||||
lRecursiveAddStructs(structParamTypes[i], structParamTypes);
|
||||
|
||||
// Now get the unique struct types. This is an n^2 search, which is
|
||||
// kind of ugly, but unlikely to be a problem in practice.
|
||||
std::vector<const StructType *> uniqueStructTypes;
|
||||
for (unsigned int i = 0; i < structParamTypes.size(); ++i) {
|
||||
for (unsigned int j = 0; j < uniqueStructTypes.size(); ++j)
|
||||
if (Type::Equal(structParamTypes[i], uniqueStructTypes[j]))
|
||||
goto skip;
|
||||
uniqueStructTypes.push_back(structParamTypes[i]);
|
||||
skip:
|
||||
;
|
||||
}
|
||||
|
||||
lGetVectorsFromStructs(uniqueStructTypes, &vectorParamTypes);
|
||||
lEmitVectorTypedefs(vectorParamTypes, f);
|
||||
lEmitStructDecls(uniqueStructTypes, f);
|
||||
// And print them
|
||||
lEmitVectorTypedefs(exportedVectorTypes, f);
|
||||
lEmitEnumDecls(exportedEnumTypes, f);
|
||||
lEmitStructDecls(exportedStructTypes, f);
|
||||
|
||||
// emit externs for globals
|
||||
if (externGlobals.size() > 0) {
|
||||
@@ -1424,3 +1432,45 @@ Module::writeHeader(const char *fn) {
fclose(f);
return true;
}


void
Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostream) const
{
clang::CompilerInstance inst;
std::string error;

inst.createFileManager();
inst.createDiagnostics(0, NULL);
clang::TargetOptions& options = inst.getTargetOpts();

llvm::Triple triple(module->getTargetTriple());
if (triple.getTriple().empty())
triple.setTriple(llvm::sys::getHostTriple());

options.Triple = triple.getTriple();

clang::TargetInfo* target
= clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options);

inst.setTarget(target);
inst.createSourceManager(inst.getFileManager());
inst.InitializeSourceManager(infilename);

clang::PreprocessorOptions& opts = inst.getPreprocessorOpts();

//Add defs for ISPC and PI
opts.addMacroDef("ISPC");
opts.addMacroDef("PI=3.1415926535");

for (unsigned int i = 0; i < g->cppArgs.size(); ++i) {
//Sanity Check, should really begin with -D
if (g->cppArgs[i].substr(0,2) == "-D") {
opts.addMacroDef(g->cppArgs[i].substr(2));
}
}
inst.createPreprocessor();
clang::DoPrintPreprocessedInput(inst.getPreprocessor(),
ostream, inst.getPreprocessorOutputOpts());
}
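A hedged sketch of the intended call pattern (from inside Module, since the method is private; variable names here are illustrative):

    std::string preprocessed;
    llvm::raw_string_ostream os(preprocessed);
    execPreprocessor(filename, &os);
    os.flush();  // 'preprocessed' now holds the preprocessed source, with the
                 // ISPC and PI macros predefined and any -D command-line
                 // arguments applied.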
|
||||
|
||||
|
||||
module.h (7 changes)

@@ -41,6 +41,11 @@

#include "ispc.h"

namespace llvm
{
class raw_string_ostream;
}

class Module {
public:
/** The name of the source file being compiled should be passed as the
@@ -108,6 +113,8 @@ private:

bool writeHeader(const char *filename);
bool writeObjectFileOrAssembly(OutputType outputType, const char *filename);
void execPreprocessor(const char *infilename, llvm::raw_string_ostream* ostream) const;

};

#endif // ISPC_MODULE_H
parse.yy (474 changes)
@@ -94,15 +94,25 @@ static void lAddMaskToSymbolTable(SourcePos pos);
|
||||
static void lAddThreadIndexCountToSymbolTable(SourcePos pos);
|
||||
static std::string lGetAlternates(std::vector<std::string> &alternates);
|
||||
static const char *lGetStorageClassString(StorageClass sc);
|
||||
static bool lGetConstantInt(Expr *expr, int *value, SourcePos pos, const char *usage);
|
||||
static void lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
|
||||
const EnumType *enumType);
|
||||
|
||||
static const char *lBuiltinTokens[] = {
|
||||
"bool", "break", "case", "cbreak", "ccontinue", "cdo", "cfor", "char",
|
||||
"cif", "cwhile", "const", "continue", "creturn", "default", "do", "double",
|
||||
"else", "enum", "export", "extern", "false", "float", "for", "goto", "if",
|
||||
"inline", "int", "int32", "int64", "launch", "print", "reference", "return",
|
||||
"inline", "int", "int8", "int16", "int32", "int64", "launch", "print",
|
||||
"reference", "return",
|
||||
"static", "struct", "switch", "sync", "task", "true", "typedef", "uniform",
|
||||
"unsigned", "varying", "void", "while", NULL
|
||||
};
|
||||
|
||||
static const char *lParamListTokens[] = {
|
||||
"bool", "char", "const", "double", "enum", "false", "float", "int",
|
||||
"int8", "int16", "int32", "int64", "reference", "struct", "true",
|
||||
"uniform", "unsigned", "varying", "void", NULL
|
||||
};
|
||||
|
||||
%}
|
||||
|
||||
@@ -110,6 +120,7 @@ static const char *lBuiltinTokens[] = {
|
||||
Expr *expr;
|
||||
ExprList *exprList;
|
||||
const Type *type;
|
||||
const AtomicType *atomicType;
|
||||
int typeQualifier;
|
||||
StorageClass storageClass;
|
||||
Stmt *stmt;
|
||||
@@ -121,18 +132,20 @@ static const char *lBuiltinTokens[] = {
|
||||
std::vector<Declarator *> *structDeclaratorList;
|
||||
StructDeclaration *structDeclaration;
|
||||
std::vector<StructDeclaration *> *structDeclarationList;
|
||||
const EnumType *enumType;
|
||||
Symbol *enumerator;
|
||||
std::vector<Symbol *> *enumeratorList;
|
||||
int32_t int32Val;
|
||||
uint32_t uint32Val;
|
||||
double floatVal;
|
||||
int64_t int64Val;
|
||||
uint64_t uint64Val;
|
||||
std::string *stringVal;
|
||||
const char *constCharPtr;
|
||||
}
|
||||
|
||||
|
||||
%token TOKEN_IDENTIFIER TOKEN_INT_CONSTANT TOKEN_UINT_CONSTANT TOKEN_FLOAT_CONSTANT
|
||||
%token TOKEN_STRING_LITERAL TOKEN_TYPE_NAME
|
||||
%token TOKEN_INT32_CONSTANT TOKEN_UINT32_CONSTANT TOKEN_INT64_CONSTANT
|
||||
%token TOKEN_UINT64_CONSTANT TOKEN_FLOAT_CONSTANT
|
||||
%token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME
|
||||
%token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP
|
||||
%token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP
|
||||
%token TOKEN_AND_OP TOKEN_OR_OP TOKEN_MUL_ASSIGN TOKEN_DIV_ASSIGN TOKEN_MOD_ASSIGN
|
||||
@@ -142,7 +155,7 @@ static const char *lBuiltinTokens[] = {
|
||||
%token TOKEN_EXTERN TOKEN_EXPORT TOKEN_STATIC TOKEN_INLINE TOKEN_TASK
|
||||
%token TOKEN_UNIFORM TOKEN_VARYING TOKEN_TYPEDEF TOKEN_SOA
|
||||
%token TOKEN_CHAR TOKEN_INT TOKEN_UNSIGNED TOKEN_FLOAT TOKEN_DOUBLE
|
||||
%token TOKEN_INT64 TOKEN_CONST TOKEN_VOID TOKEN_BOOL
|
||||
%token TOKEN_INT8 TOKEN_INT16 TOKEN_INT64 TOKEN_CONST TOKEN_VOID TOKEN_BOOL
|
||||
%token TOKEN_ENUM TOKEN_STRUCT TOKEN_TRUE TOKEN_FALSE TOKEN_REFERENCE
|
||||
|
||||
%token TOKEN_CASE TOKEN_DEFAULT TOKEN_IF TOKEN_ELSE TOKEN_SWITCH
|
||||
@@ -174,15 +187,21 @@ static const char *lBuiltinTokens[] = {
|
||||
%type <structDeclaration> struct_declaration
|
||||
%type <structDeclarationList> struct_declaration_list
|
||||
|
||||
%type <enumeratorList> enumerator_list
|
||||
%type <enumerator> enumerator
|
||||
%type <enumType> enum_specifier
|
||||
|
||||
%type <type> specifier_qualifier_list struct_or_union_specifier
|
||||
%type <type> enum_specifier type_specifier type_name
|
||||
%type <type> type_specifier type_name
|
||||
%type <type> short_vec_specifier
|
||||
%type <atomicType> atomic_var_type_specifier
|
||||
|
||||
%type <typeQualifier> type_qualifier
|
||||
%type <storageClass> storage_class_specifier
|
||||
%type <declSpecs> declaration_specifiers
|
||||
|
||||
%type <stringVal> string_constant
|
||||
%type <constCharPtr> struct_or_union_name
|
||||
%type <constCharPtr> struct_or_union_name enum_identifier
|
||||
%type <int32Val> int_constant soa_width_specifier
|
||||
|
||||
%start translation_unit
|
||||
@@ -211,12 +230,17 @@ primary_expression
|
||||
Error(@1, "Undeclared symbol \"%s\".%s", name, alts.c_str());
|
||||
}
|
||||
}
|
||||
| TOKEN_INT_CONSTANT {
|
||||
/* FIXME: should support 64 bit constants (and doubles...) */
|
||||
| TOKEN_INT32_CONSTANT {
|
||||
$$ = new ConstExpr(AtomicType::UniformConstInt32, yylval.int32Val, @1);
|
||||
}
|
||||
| TOKEN_UINT_CONSTANT {
|
||||
$$ = new ConstExpr(AtomicType::UniformConstUInt32, yylval.uint32Val, @1);
|
||||
| TOKEN_UINT32_CONSTANT {
|
||||
$$ = new ConstExpr(AtomicType::UniformConstUInt32, (uint32_t)yylval.int32Val, @1);
|
||||
}
|
||||
| TOKEN_INT64_CONSTANT {
|
||||
$$ = new ConstExpr(AtomicType::UniformConstInt64, yylval.int64Val, @1);
|
||||
}
|
||||
| TOKEN_UINT64_CONSTANT {
|
||||
$$ = new ConstExpr(AtomicType::UniformConstUInt64, (uint64_t)yylval.int64Val, @1);
|
||||
}
|
||||
| TOKEN_FLOAT_CONSTANT {
|
||||
$$ = new ConstExpr(AtomicType::UniformConstFloat, (float)yylval.floatVal, @1);
|
||||
@@ -245,7 +269,7 @@ postfix_expression
|
||||
| TOKEN_LAUNCH '<' postfix_expression '(' ')' '>'
|
||||
{ $$ = new FunctionCallExpr($3, new ExprList(@3), @3, true); }
|
||||
| postfix_expression '.' TOKEN_IDENTIFIER
|
||||
{ $$ = new MemberExpr($1, yytext, @1, @3); }
|
||||
{ $$ = MemberExpr::create($1, yytext, @1, @3); }
|
||||
/* | postfix_expression TOKEN_PTR_OP TOKEN_IDENTIFIER
|
||||
{ UNIMPLEMENTED }
|
||||
*/
|
||||
@@ -292,9 +316,13 @@ cast_expression
|
||||
// uniform float x = 1. / (float)y;
|
||||
// don't issue an error due to (float)y being inadvertently
|
||||
// and undesirably-to-the-user "varying"...
|
||||
if ($4->GetType()->IsUniformType())
|
||||
$2 = $2->GetAsUniformType();
|
||||
$$ = new TypeCastExpr($2, $4, @1);
|
||||
if ($2 == NULL || $4 == NULL || $4->GetType() == NULL)
|
||||
$$ = NULL;
|
||||
else {
|
||||
if ($4->GetType()->IsUniformType())
|
||||
$2 = $2->GetAsUniformType();
|
||||
$$ = new TypeCastExpr($2, $4, @1);
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
@@ -446,13 +474,15 @@ declaration_specifiers
|
||||
| storage_class_specifier declaration_specifiers
|
||||
{
|
||||
DeclSpecs *ds = (DeclSpecs *)$2;
|
||||
if (ds->storageClass != SC_NONE)
|
||||
Error(@1, "Multiple storage class specifiers in a declaration are illegal. "
|
||||
"(Have provided both \"%s\" and \"%s\".)",
|
||||
lGetStorageClassString(ds->storageClass),
|
||||
lGetStorageClassString($1));
|
||||
else
|
||||
ds->storageClass = $1;
|
||||
if (ds != NULL) {
|
||||
if (ds->storageClass != SC_NONE)
|
||||
Error(@1, "Multiple storage class specifiers in a declaration are illegal. "
|
||||
"(Have provided both \"%s\" and \"%s\".)",
|
||||
lGetStorageClassString(ds->storageClass),
|
||||
lGetStorageClassString($1));
|
||||
else
|
||||
ds->storageClass = $1;
|
||||
}
|
||||
$$ = ds;
|
||||
}
|
||||
| soa_width_specifier
|
||||
@@ -464,10 +494,12 @@ declaration_specifiers
|
||||
| soa_width_specifier declaration_specifiers
|
||||
{
|
||||
DeclSpecs *ds = (DeclSpecs *)$2;
|
||||
if (ds->soaWidth != 0)
|
||||
Error(@1, "soa<> qualifier supplied multiple times in declaration.");
|
||||
else
|
||||
ds->soaWidth = $1;
|
||||
if (ds != NULL) {
|
||||
if (ds->soaWidth != 0)
|
||||
Error(@1, "soa<> qualifier supplied multiple times in declaration.");
|
||||
else
|
||||
ds->soaWidth = $1;
|
||||
}
|
||||
$$ = ds;
|
||||
}
|
||||
| type_specifier
|
||||
@@ -483,9 +515,11 @@ declaration_specifiers
|
||||
| type_specifier declaration_specifiers
|
||||
{
|
||||
DeclSpecs *ds = (DeclSpecs *)$2;
|
||||
if (ds->baseType != NULL)
|
||||
Error(@1, "Multiple types provided for declaration.");
|
||||
ds->baseType = $1;
|
||||
if (ds != NULL) {
|
||||
if (ds->baseType != NULL)
|
||||
Error(@1, "Multiple types provided for declaration.");
|
||||
ds->baseType = $1;
|
||||
}
|
||||
$$ = ds;
|
||||
}
|
||||
| type_qualifier
|
||||
@@ -495,7 +529,8 @@ declaration_specifiers
|
||||
| type_qualifier declaration_specifiers
|
||||
{
|
||||
DeclSpecs *ds = (DeclSpecs *)$2;
|
||||
ds->typeQualifier |= $1;
|
||||
if (ds != NULL)
|
||||
ds->typeQualifier |= $1;
|
||||
$$ = ds;
|
||||
}
|
||||
;
|
||||
@@ -510,14 +545,20 @@ init_declarator_list
|
||||
| init_declarator_list ',' init_declarator
|
||||
{
|
||||
std::vector<Declarator *> *dl = (std::vector<Declarator *> *)$1;
|
||||
dl->push_back($3);
|
||||
if (dl != NULL && $3 != NULL)
|
||||
dl->push_back($3);
|
||||
$$ = $1;
|
||||
}
|
||||
;
|
||||
|
||||
init_declarator
|
||||
: declarator
|
||||
| declarator '=' initializer { $1->initExpr = $3; $$ = $1; }
|
||||
| declarator '=' initializer
|
||||
{
|
||||
if ($1 != NULL)
|
||||
$1->initExpr = $3;
|
||||
$$ = $1;
|
||||
}
|
||||
;
|
||||
|
||||
storage_class_specifier
|
||||
@@ -534,24 +575,34 @@ storage_class_specifier
|
||||
;
|
||||
|
||||
type_specifier
|
||||
: TOKEN_VOID { $$ = AtomicType::Void; }
|
||||
| TOKEN_BOOL { $$ = AtomicType::VaryingBool; }
|
||||
/* | TOKEN_CHAR { UNIMPLEMENTED; } */
|
||||
| TOKEN_INT { $$ = AtomicType::VaryingInt32; }
|
||||
| TOKEN_FLOAT { $$ = AtomicType::VaryingFloat; }
|
||||
| TOKEN_DOUBLE { $$ = AtomicType::VaryingDouble; }
|
||||
| TOKEN_INT64 { $$ = AtomicType::VaryingInt64; }
|
||||
: atomic_var_type_specifier { $$ = $1; }
|
||||
| TOKEN_TYPE_NAME
|
||||
{ const Type *t = m->symbolTable->LookupType(yytext);
|
||||
assert(t != NULL);
|
||||
$$ = t;
|
||||
}
|
||||
| struct_or_union_specifier { $$ = $1; }
|
||||
| enum_specifier
|
||||
{ UNIMPLEMENTED; }
|
||||
/* | TOKEN_TYPE_NAME
|
||||
{ UNIMPLEMENTED; }
|
||||
*/
|
||||
| enum_specifier { $$ = $1; }
|
||||
;
|
||||
|
||||
atomic_var_type_specifier
|
||||
: TOKEN_VOID { $$ = AtomicType::Void; }
|
||||
| TOKEN_BOOL { $$ = AtomicType::VaryingBool; }
|
||||
| TOKEN_INT8 { $$ = AtomicType::VaryingInt8; }
|
||||
| TOKEN_INT16 { $$ = AtomicType::VaryingInt16; }
|
||||
| TOKEN_INT { $$ = AtomicType::VaryingInt32; }
|
||||
| TOKEN_FLOAT { $$ = AtomicType::VaryingFloat; }
|
||||
| TOKEN_DOUBLE { $$ = AtomicType::VaryingDouble; }
|
||||
| TOKEN_INT64 { $$ = AtomicType::VaryingInt64; }
|
||||
;
|
||||
|
||||
short_vec_specifier
|
||||
: atomic_var_type_specifier '<' int_constant '>'
|
||||
{
|
||||
Type* vt =
|
||||
new VectorType($1, $3);
|
||||
$$ = vt;
|
||||
}
|
||||
;
|
||||
|
||||
struct_or_union_name
|
||||
@@ -564,9 +615,11 @@ struct_or_union_specifier
|
||||
{
|
||||
std::vector<const Type *> elementTypes;
|
||||
std::vector<std::string> elementNames;
|
||||
GetStructTypesAndNames(*$4, &elementTypes, &elementNames);
|
||||
std::vector<SourcePos> elementPositions;
|
||||
GetStructTypesNamesPositions(*$4, &elementTypes, &elementNames,
|
||||
&elementPositions);
|
||||
StructType *st = new StructType($2, elementTypes, elementNames,
|
||||
false, true, @2);
|
||||
elementPositions, false, true, @2);
|
||||
m->symbolTable->AddType($2, st, @2);
|
||||
$$ = st;
|
||||
}
|
||||
@@ -574,8 +627,11 @@ struct_or_union_specifier
|
||||
{
|
||||
std::vector<const Type *> elementTypes;
|
||||
std::vector<std::string> elementNames;
|
||||
GetStructTypesAndNames(*$3, &elementTypes, &elementNames);
|
||||
$$ = new StructType("", elementTypes, elementNames, false, true, @1);
|
||||
std::vector<SourcePos> elementPositions;
|
||||
GetStructTypesNamesPositions(*$3, &elementTypes, &elementNames,
|
||||
&elementPositions);
|
||||
$$ = new StructType("", elementTypes, elementNames, elementPositions,
|
||||
false, true, @1);
|
||||
}
|
||||
| struct_or_union '{' '}'
|
||||
{
|
||||
@@ -593,7 +649,8 @@ struct_or_union_specifier
|
||||
Error(@2, "Struct type \"%s\" unknown.%s", $2, alts.c_str());
|
||||
}
|
||||
else if (dynamic_cast<const StructType *>(st) == NULL)
|
||||
Error(@2, "Type \"%s\" is not a struct type!", $2);
|
||||
Error(@2, "Type \"%s\" is not a struct type! (%s)", $2,
|
||||
st->GetString().c_str());
|
||||
$$ = st;
|
||||
}
|
||||
;
|
||||
@@ -606,13 +663,15 @@ struct_declaration_list
|
||||
: struct_declaration
|
||||
{
|
||||
std::vector<StructDeclaration *> *sdl = new std::vector<StructDeclaration *>;
|
||||
sdl->push_back($1);
|
||||
if (sdl != NULL && $1 != NULL)
|
||||
sdl->push_back($1);
|
||||
$$ = sdl;
|
||||
}
|
||||
| struct_declaration_list struct_declaration
|
||||
{
|
||||
std::vector<StructDeclaration *> *sdl = (std::vector<StructDeclaration *> *)$1;
|
||||
sdl->push_back($2);
|
||||
if (sdl != NULL && $2 != NULL)
|
||||
sdl->push_back($2);
|
||||
$$ = $1;
|
||||
}
|
||||
;
|
||||
@@ -625,29 +684,34 @@ struct_declaration
|
||||
specifier_qualifier_list
|
||||
: type_specifier specifier_qualifier_list
|
||||
| type_specifier
|
||||
| short_vec_specifier
|
||||
| type_qualifier specifier_qualifier_list
|
||||
{
|
||||
if ($1 == TYPEQUAL_UNIFORM)
|
||||
$$ = $2->GetAsUniformType();
|
||||
else if ($1 == TYPEQUAL_VARYING)
|
||||
$$ = $2->GetAsVaryingType();
|
||||
else if ($1 == TYPEQUAL_REFERENCE)
|
||||
$$ = new ReferenceType($2, false);
|
||||
else if ($1 == TYPEQUAL_CONST)
|
||||
$$ = $2->GetAsConstType();
|
||||
else if ($1 == TYPEQUAL_UNSIGNED) {
|
||||
const Type *t = $2->GetAsUnsignedType();
|
||||
if (t)
|
||||
$$ = t;
|
||||
if ($2 != NULL) {
|
||||
if ($1 == TYPEQUAL_UNIFORM)
|
||||
$$ = $2->GetAsUniformType();
|
||||
else if ($1 == TYPEQUAL_VARYING)
|
||||
$$ = $2->GetAsVaryingType();
|
||||
else if ($1 == TYPEQUAL_REFERENCE)
|
||||
$$ = new ReferenceType($2, false);
|
||||
else if ($1 == TYPEQUAL_CONST)
|
||||
$$ = $2->GetAsConstType();
|
||||
else if ($1 == TYPEQUAL_UNSIGNED) {
|
||||
const Type *t = $2->GetAsUnsignedType();
|
||||
if (t)
|
||||
$$ = t;
|
||||
else {
|
||||
Error(@1, "Can't apply \"unsigned\" qualifier to \"%s\" type. Ignoring.",
|
||||
$2->GetString().c_str());
|
||||
$$ = $2;
|
||||
}
|
||||
}
|
||||
else {
|
||||
Error(@1, "Can't apply \"unsigned\" qualifier to \"%s\" type. Ignoring.",
|
||||
$2->GetString().c_str());
|
||||
$$ = $2;
|
||||
UNIMPLEMENTED;
|
||||
}
|
||||
}
|
||||
else {
|
||||
UNIMPLEMENTED;
|
||||
}
|
||||
else
|
||||
$$ = NULL;
|
||||
}
|
||||
/* K&R--implicit int type--e.g. "static foo" -> foo is an int */
|
||||
/* | type_qualifier { UNIMPLEMENTED; }*/
|
||||
@@ -658,13 +722,15 @@ struct_declarator_list
|
||||
: struct_declarator
|
||||
{
|
||||
std::vector<Declarator *> *sdl = new std::vector<Declarator *>;
|
||||
sdl->push_back($1);
|
||||
if ($1 != NULL)
|
||||
sdl->push_back($1);
|
||||
$$ = sdl;
|
||||
}
|
||||
| struct_declarator_list ',' struct_declarator
|
||||
{
|
||||
std::vector<Declarator *> *sdl = (std::vector<Declarator *> *)$1;
|
||||
sdl->push_back($3);
|
||||
if (sdl != NULL && $3 != NULL)
|
||||
sdl->push_back($3);
|
||||
$$ = $1;
|
||||
}
|
||||
;
|
||||
@@ -677,24 +743,98 @@ struct_declarator
|
||||
*/
|
||||
;
|
||||
|
||||
enum_identifier
|
||||
: TOKEN_IDENTIFIER { $$ = strdup(yytext); }
|
||||
|
||||
enum_specifier
|
||||
: TOKEN_ENUM '{' enumerator_list '}'
|
||||
{ UNIMPLEMENTED; }
|
||||
| TOKEN_ENUM TOKEN_IDENTIFIER '{' enumerator_list '}'
|
||||
{ UNIMPLEMENTED; }
|
||||
| TOKEN_ENUM TOKEN_IDENTIFIER
|
||||
{ UNIMPLEMENTED; }
|
||||
{
|
||||
if ($3 != NULL) {
|
||||
EnumType *enumType = new EnumType(@1);
|
||||
|
||||
lFinalizeEnumeratorSymbols(*$3, enumType);
|
||||
for (unsigned int i = 0; i < $3->size(); ++i)
|
||||
m->symbolTable->AddVariable((*$3)[i]);
|
||||
enumType->SetEnumerators(*$3);
|
||||
$$ = enumType;
|
||||
}
|
||||
else
|
||||
$$ = NULL;
|
||||
}
|
||||
| TOKEN_ENUM enum_identifier '{' enumerator_list '}'
|
||||
{
|
||||
if ($4 != NULL) {
|
||||
EnumType *enumType = new EnumType($2, $2);
|
||||
m->symbolTable->AddType($2, enumType, @2);
|
||||
|
||||
lFinalizeEnumeratorSymbols(*$4, enumType);
|
||||
for (unsigned int i = 0; i < $4->size(); ++i)
|
||||
m->symbolTable->AddVariable((*$4)[i]);
|
||||
enumType->SetEnumerators(*$4);
|
||||
$$ = enumType;
|
||||
}
|
||||
else
|
||||
$$ = NULL;
|
||||
}
|
||||
| TOKEN_ENUM enum_identifier
|
||||
{
|
||||
const Type *type = m->symbolTable->LookupType($2);
|
||||
if (type == NULL) {
|
||||
std::vector<std::string> alternates = m->symbolTable->ClosestEnumTypeMatch($2);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@2, "Enum type \"%s\" unknown.%s", $2, alts.c_str());
|
||||
$$ = NULL;
|
||||
}
|
||||
else {
|
||||
const EnumType *enumType = dynamic_cast<const EnumType *>(type);
|
||||
if (enumType == NULL) {
|
||||
Error(@2, "Type \"%s\" is not an enum type (%s).", $2,
|
||||
type->GetString().c_str());
|
||||
$$ = NULL;
|
||||
}
|
||||
else
|
||||
$$ = enumType;
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
enumerator_list
|
||||
: enumerator
|
||||
{ UNIMPLEMENTED; }
|
||||
{
|
||||
if ($1 == NULL)
|
||||
$$ = NULL;
|
||||
else {
|
||||
std::vector<Symbol *> *el = new std::vector<Symbol *>;
|
||||
el->push_back($1);
|
||||
$$ = el;
|
||||
}
|
||||
}
|
||||
| enumerator_list ',' enumerator
|
||||
{
|
||||
if ($1 != NULL && $3 != NULL)
|
||||
$1->push_back($3);
|
||||
$$ = $1;
|
||||
}
|
||||
;
|
||||
|
||||
enumerator
|
||||
: TOKEN_IDENTIFIER
|
||||
| TOKEN_IDENTIFIER '=' constant_expression
|
||||
: enum_identifier
|
||||
{
|
||||
$$ = new Symbol($1, @1);
|
||||
}
|
||||
| enum_identifier '=' constant_expression
|
||||
{
|
||||
int value;
|
||||
if ($1 != NULL && $3 != NULL &&
|
||||
lGetConstantInt($3, &value, @3, "Enumerator value")) {
|
||||
Symbol *sym = new Symbol($1, @1);
|
||||
sym->constValue = new ConstExpr(AtomicType::UniformConstUInt32,
|
||||
(uint32_t)value, @3);
|
||||
$$ = sym;
|
||||
}
|
||||
else
|
||||
$$ = NULL;
|
||||
}
|
||||
;
|
||||
|
||||
type_qualifier
|
||||
@@ -712,7 +852,7 @@ declarator
|
||||
;
|
||||
|
||||
int_constant
|
||||
: TOKEN_INT_CONSTANT { $$ = yylval.int32Val; }
|
||||
: TOKEN_INT32_CONSTANT { $$ = yylval.int32Val; }
|
||||
;
|
||||
|
||||
direct_declarator
|
||||
@@ -724,45 +864,35 @@ direct_declarator
|
||||
| '(' declarator ')' { $$ = $2; }
|
||||
| direct_declarator '[' constant_expression ']'
|
||||
{
|
||||
Expr *size = $3;
|
||||
if (size) size = size->TypeCheck();
|
||||
if (size) {
|
||||
size = size->Optimize();
|
||||
llvm::Constant *cval = size->GetConstant(size->GetType());
|
||||
if (!cval) {
|
||||
Error(@3, "Array dimension must be compile-time constant");
|
||||
$$ = NULL;
|
||||
}
|
||||
else {
|
||||
llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(cval);
|
||||
if (!ci) {
|
||||
Error(@3, "Array dimension must be compile-time integer constant.");
|
||||
$$ = NULL;
|
||||
}
|
||||
$1->AddArrayDimension((int)ci->getZExtValue());
|
||||
$$ = $1;
|
||||
}
|
||||
int size;
|
||||
if ($1 != NULL && lGetConstantInt($3, &size, @3, "Array dimension")) {
|
||||
$1->AddArrayDimension(size);
|
||||
$$ = $1;
|
||||
}
|
||||
else
|
||||
$$ = NULL;
|
||||
}
|
||||
| direct_declarator '[' ']'
|
||||
{
|
||||
$1->AddArrayDimension(-1); // unsized
|
||||
if ($1 != NULL)
|
||||
$1->AddArrayDimension(-1); // unsized
|
||||
$$ = $1;
|
||||
}
|
||||
| direct_declarator '(' parameter_type_list ')'
|
||||
{
|
||||
Declarator *d = (Declarator *)$1;
|
||||
d->isFunction = true;
|
||||
d->functionArgs = $3;
|
||||
if (d != NULL) {
|
||||
d->isFunction = true;
|
||||
d->functionArgs = $3;
|
||||
}
|
||||
$$ = d;
|
||||
}
|
||||
/* K&R? | direct_declarator '(' identifier_list ')' */
|
||||
| direct_declarator '(' ')'
|
||||
{
|
||||
Declarator *d = (Declarator *)$1;
|
||||
d->isFunction = true;
|
||||
if (d != NULL)
|
||||
d->isFunction = true;
|
||||
$$ = d;
|
||||
}
|
||||
;
|
||||
@@ -776,15 +906,33 @@ parameter_list
|
||||
: parameter_declaration
|
||||
{
|
||||
std::vector<Declaration *> *dl = new std::vector<Declaration *>;
|
||||
dl->push_back($1);
|
||||
if ($1 != NULL)
|
||||
dl->push_back($1);
|
||||
$$ = dl;
|
||||
}
|
||||
| parameter_list ',' parameter_declaration
|
||||
{
|
||||
std::vector<Declaration *> *dl = (std::vector<Declaration *> *)$1;
|
||||
dl->push_back($3);
|
||||
if (dl == NULL)
|
||||
// dl may be NULL due to an earlier parse error...
|
||||
dl = new std::vector<Declaration *>;
|
||||
if ($3 != NULL)
|
||||
dl->push_back($3);
|
||||
$$ = dl;
|
||||
}
|
||||
| error
|
||||
{
|
||||
std::vector<std::string> builtinTokens;
|
||||
const char **token = lParamListTokens;
|
||||
while (*token) {
|
||||
builtinTokens.push_back(*token);
|
||||
++token;
|
||||
}
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||
$$ = NULL;
|
||||
}
|
||||
;
|
||||
|
||||
parameter_declaration
|
||||
@@ -794,7 +942,8 @@ parameter_declaration
|
||||
}
|
||||
| declaration_specifiers declarator '=' initializer
|
||||
{
|
||||
$2->initExpr = $4;
|
||||
if ($2 != NULL)
|
||||
$2->initExpr = $4;
|
||||
$$ = new Declaration($1, $2);
|
||||
|
||||
}
|
||||
@@ -850,10 +999,14 @@ initializer_list
|
||||
{ $$ = new ExprList($1, @1); }
|
||||
| initializer_list ',' initializer
|
||||
{
|
||||
ExprList *exprList = dynamic_cast<ExprList *>($1);
|
||||
assert(exprList);
|
||||
exprList->exprs.push_back($3);
|
||||
$$ = exprList;
|
||||
if ($1 == NULL)
|
||||
$$ = NULL;
|
||||
else {
|
||||
ExprList *exprList = dynamic_cast<ExprList *>($1);
|
||||
assert(exprList);
|
||||
exprList->exprs.push_back($3);
|
||||
$$ = exprList;
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
@@ -866,6 +1019,19 @@ statement
|
||||
| jump_statement
|
||||
| declaration_statement
|
||||
| print_statement
|
||||
| error
|
||||
{
|
||||
std::vector<std::string> builtinTokens;
|
||||
const char **token = lBuiltinTokens;
|
||||
while (*token) {
|
||||
builtinTokens.push_back(*token);
|
||||
++token;
|
||||
}
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||
$$ = NULL;
|
||||
}
|
||||
;
|
||||
|
||||
labeled_statement
|
||||
@@ -897,7 +1063,8 @@ statement_list
|
||||
}
|
||||
| statement_list statement
|
||||
{
|
||||
((StmtList *)$1)->Add($2);
|
||||
if ($1 != NULL)
|
||||
((StmtList *)$1)->Add($2);
|
||||
$$ = $1;
|
||||
}
|
||||
;
|
||||
@@ -1021,8 +1188,9 @@ external_declaration
|
||||
| TOKEN_EXTERN TOKEN_STRING_LITERAL '{' declaration '}' // FIXME: make sure string=="C"
|
||||
| declaration
|
||||
{
|
||||
for (unsigned int i = 0; i < $1->declarators.size(); ++i)
|
||||
m->AddGlobal($1->declSpecs, $1->declarators[i]);
|
||||
if ($1 != NULL)
|
||||
for (unsigned int i = 0; i < $1->declarators.size(); ++i)
|
||||
m->AddGlobal($1->declSpecs, $1->declarators[i]);
|
||||
}
|
||||
;
|
||||
|
||||
@@ -1064,6 +1232,8 @@ lAddFunctionParams(Declarator *decl) {
|
||||
if (decl->functionArgs) {
|
||||
for (unsigned int i = 0; i < decl->functionArgs->size(); ++i) {
|
||||
Declaration *pdecl = (*decl->functionArgs)[i];
|
||||
if (pdecl == NULL)
|
||||
continue;
|
||||
assert(pdecl->declarators.size() == 1);
|
||||
Symbol *sym = pdecl->declarators[0]->sym;
|
||||
#ifndef NDEBUG
|
||||
@@ -1136,3 +1306,81 @@ lGetStorageClassString(StorageClass sc) {
}


/** Given an expression, see if it is equal to a compile-time constant
integer value. If so, return true and return the value in *value.
If the expression isn't a compile-time constant or isn't an integer
type, return false.
*/
static bool
lGetConstantInt(Expr *expr, int *value, SourcePos pos, const char *usage) {
if (expr == NULL)
return false;
expr = expr->TypeCheck();
if (expr == NULL)
return false;
expr = expr->Optimize();
if (expr == NULL)
return false;

llvm::Constant *cval = expr->GetConstant(expr->GetType());
if (cval == NULL) {
Error(pos, "%s must be a compile-time constant.", usage);
return false;
}
else {
llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(cval);
if (ci == NULL) {
Error(pos, "%s must be a compile-time integer constant.", usage);
return false;
}
*value = (int)ci->getZExtValue();
return true;
}
}
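A hedged usage sketch, mirroring the array-dimension rule in the grammar above ('expr', 'pos', and 'declarator' stand in for whatever the action has in hand):

    int size;
    if (lGetConstantInt(expr, &size, pos, "Array dimension"))
        declarator->AddArrayDimension(size);  // e.g. "float a[2*8]" gives size == 16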


/** Given an array of enumerator symbols, make sure each of them has a
ConstExpr * in their Symbol::constValue member that stores their
unsigned integer value. Symbols that had values explicitly provided
in the source file will already have ConstExpr * set; we just need
to set the values for the others here.
*/
static void
lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
const EnumType *enumType) {
enumType = enumType->GetAsConstType();
enumType = enumType->GetAsUniformType();

/* nextVal tracks the value for the next enumerant. It starts from
zero and goes up with each successive enumerant. If any of them
has a value specified, then nextVal is ignored for that one and is
set to one plus that one's value for the default value for the next
one. */
uint32_t nextVal = 0;

for (unsigned int i = 0; i < enums.size(); ++i) {
enums[i]->type = enumType;
if (enums[i]->constValue != NULL) {
/* Already has a value, so first update nextVal with it. */
int count = enums[i]->constValue->AsUInt32(&nextVal);
assert(count == 1);
++nextVal;

/* When the source file was being parsed, the ConstExpr for any
enumerant with a specified value was set to have unsigned
int32 type, since we hadn't created the parent EnumType
by then. Therefore, add a little type cast from uint32 to
the actual enum type here and optimize it, which will have
us end up with a ConstExpr with the desired EnumType... */
Expr *castExpr = new TypeCastExpr(enumType, enums[i]->constValue,
enums[i]->pos);
castExpr = castExpr->Optimize();
enums[i]->constValue = dynamic_cast<ConstExpr *>(castExpr);
assert(enums[i]->constValue != NULL);
}
else {
enums[i]->constValue = new ConstExpr(enumType, nextVal++,
enums[i]->pos);
}
}
}
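A worked example of the numbering rule implemented above (values as the loop would assign them):

    // enum Foo { A, B, C = 10, D };
    //   A -> 0, B -> 1    (nextVal counts up from zero)
    //   C -> 10           (explicit value; nextVal becomes 11)
    //   D -> 11           (continues one past the previous explicit value)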

run_tests.sh (104 changes)
@@ -1,40 +1,86 @@
|
||||
#!/bin/zsh
|
||||
#!/bin/bash
|
||||
|
||||
surprises=0
|
||||
verbose=false
|
||||
number=$(ls -1 tests/*.ispc|wc -l)
|
||||
counter=1
|
||||
target=sse4
|
||||
|
||||
echo Running correctness tests
|
||||
while getopts ":vt:h" opt;do
|
||||
case $opt in
|
||||
v) verbose=true
|
||||
;;
|
||||
t) target=$OPTARG
|
||||
;;
|
||||
h) cat <<EOF
|
||||
usage: run_tests.sh [-v] [-t target] [filenames]
|
||||
-v # verbose output
|
||||
-t # specify compilation target (SSE4 is the default).
|
||||
[filenames] # (optional) files to run through testing infrastructure
|
||||
# if none are provided, all in tests/ will be run.
|
||||
EOF
|
||||
exit 1
|
||||
esac
|
||||
done
|
||||
|
||||
for i in tests/*.ispc; do
|
||||
bc=${i%%ispc}bc
|
||||
ispc -O2 $i -woff -o $bc --emit-llvm --target=sse4
|
||||
if [[ $? != 0 ]]; then
|
||||
surprises=1
|
||||
echo Test $i FAILED ispc compile
|
||||
echo
|
||||
else
|
||||
ispc_test $bc
|
||||
shift $(( $OPTIND - 1 ))
|
||||
if [[ "$1" > 0 ]]; then
|
||||
while [[ "$1" > 0 ]]; do
|
||||
i=$1
|
||||
shift
|
||||
echo Running test $i
|
||||
|
||||
bc=${i%%ispc}bc
|
||||
ispc -O2 $i -woff -o $bc --emit-llvm --target=$target
|
||||
if [[ $? != 0 ]]; then
|
||||
surprises=1
|
||||
echo Test $i FAILED ispc_test
|
||||
echo Test $i FAILED ispc compile
|
||||
echo
|
||||
else
|
||||
ispc_test $bc
|
||||
if [[ $? != 0 ]]; then
|
||||
surprises=1
|
||||
echo Test $i FAILED ispc_test
|
||||
echo
|
||||
fi
|
||||
fi
|
||||
/bin/rm -f $bc
|
||||
done
|
||||
else
|
||||
echo Running all correctness tests
|
||||
|
||||
for i in tests/*.ispc; do
|
||||
if $verbose; then
|
||||
echo -en "Running test $counter of $number.\r"
|
||||
fi
|
||||
(( counter++ ))
|
||||
bc=${i%%ispc}bc
|
||||
ispc -O2 $i -woff -o $bc --emit-llvm --target=$target
|
||||
if [[ $? != 0 ]]; then
|
||||
surprises=1
|
||||
echo Test $i FAILED ispc compile
|
||||
echo
|
||||
else
|
||||
ispc_test $bc
|
||||
if [[ $? != 0 ]]; then
|
||||
surprises=1
|
||||
echo Test $i FAILED ispc_test
|
||||
echo
|
||||
fi
|
||||
fi
|
||||
/bin/rm -f $bc
|
||||
done
|
||||
|
||||
echo -e "\nRunning failing tests"
|
||||
for i in failing_tests/*.ispc; do
|
||||
(ispc -O2 $i -woff -o - --emit-llvm | ispc_test -) 2>/dev/null 1>/dev/null
|
||||
if [[ $? == 0 ]]; then
|
||||
surprises=1
|
||||
echo Test $i UNEXPECTEDLY PASSED
|
||||
echo
|
||||
fi
|
||||
# cmp $bc tests_bitcode${bc##tests}
|
||||
# if [[ $? == 0 ]]; then
|
||||
# /bin/rm $bc
|
||||
# fi
|
||||
fi
|
||||
/bin/rm $bc
|
||||
done
|
||||
|
||||
echo Running failing tests
|
||||
for i in failing_tests/*.ispc; do
|
||||
(ispc -O2 $i -woff -o - --emit-llvm | ispc_test -) 2>/dev/null 1>/dev/null
|
||||
if [[ $? == 0 ]]; then
|
||||
surprises=1
|
||||
echo Test $i UNEXPECTEDLY PASSED
|
||||
echo
|
||||
fi
|
||||
done
|
||||
done
|
||||
fi
|
||||
|
||||
if [[ $surprises == 0 ]]; then
|
||||
echo No surprises.
|
||||
|
||||
stdlib-avx.ll (589 changes)
@@ -1,589 +0,0 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; *** Untested *** AVX target implementation.
|
||||
;;
|
||||
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||
;; chance that there are bugs in the code in this file.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Basic 8-wide definitions
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
int8_16(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rcp.ps(<8 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.rcp.ss(<8 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
|
||||
%call = call <8 x float> @llvm.x86.avx.rcp.ps(<8 x float> %0)
|
||||
; do one N-R iteration
|
||||
%v_iv = fmul <8 x float> %0, %call
|
||||
%two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.>, %v_iv
|
||||
%iv_mul = fmul <8 x float> %call, %two_minus
|
||||
ret <8 x float> %iv_mul
|
||||
}
|
||||
|
||||
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float iv = extract(__rcp_u(v), 0);
|
||||
; return iv * (2. - v * iv);
|
||||
%vecval = insertelement <8 x float> undef, float %0, i32 0
|
||||
%call = call <8 x float> @llvm.x86.avx.rcp.ss(<8 x float> %vecval)
|
||||
%scall = extractelement <8 x float> %call, i32 0
|
||||
|
||||
; do one N-R iteration
|
||||
%v_iv = fmul float %0, %scall
|
||||
%two_minus = fsub float 2., %v_iv
|
||||
%iv_mul = fmul float %scall, %two_minus
|
||||
ret float %iv_mul
|
||||
}
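The refinement applied in both functions is one Newton-Raphson iteration on the hardware's rcp estimate. In C-like form (illustrative, not the IR itself):

    // iv is the hardware estimate of 1/v; one iteration roughly doubles
    // the number of correct bits in the result.
    float refined_rcp(float v, float iv) {
        return iv * (2.0f - v * iv);
    }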
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.round.ps(<8 x float>, i32) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.round.ss(<8 x float>, <8 x float>, i32) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
%call = call <8 x float> @llvm.x86.avx.round.ps(<8 x float> %0, i32 8)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
; the roundss intrinsic is a total mess--docs say:
|
||||
;
|
||||
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
||||
;
|
||||
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
||||
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
||||
; return value is described by the following equations:
|
||||
;
|
||||
; r0 = RND(b0)
|
||||
; r1 = a1
|
||||
; r2 = a2
|
||||
; r3 = a3
|
||||
;
|
||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||
; here. So we pass the same register for both.
|
||||
%xi = insertelement <8 x float> undef, float %0, i32 0
|
||||
%xr = call <8 x float> @llvm.x86.avx.round.ss(<8 x float> %xi, <8 x float> %xi, i32 8)
|
||||
%rs = extractelement <8 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
%call = call <8 x float> @llvm.x86.avx.round.ps(<8 x float> %0, i32 9)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <8 x float> undef, float %0, i32 0
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
%xr = call <8 x float> @llvm.x86.avx.round.ss(<8 x float> %xi, <8 x float> %xi, i32 9)
|
||||
%rs = extractelement <8 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
%call = call <8 x float> @llvm.x86.avx.round.ps(<8 x float> %0, i32 10)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <8 x float> undef, float %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
%xr = call <8 x float> @llvm.x86.avx.round.ss(<8 x float> %xi, <8 x float> %xi, i32 10)
|
||||
%rs = extractelement <8 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
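For reference, the immediate values used in the three rounding functions above, spelled with the usual SSE4.1 intrinsic names (the IR calls the llvm.x86.avx.round.* intrinsics directly, so these names are purely illustrative):

    //  8 == _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC   (round)
    //  9 == _MM_FROUND_TO_NEG_INF     | _MM_FROUND_NO_EXC   (floor)
    // 10 == _MM_FROUND_TO_POS_INF     | _MM_FROUND_NO_EXC   (ceil)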
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rsqrt.ps(<8 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.rsqrt.ss(<8 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
%is = call <8 x float> @llvm.x86.avx.rsqrt.ps(<8 x float> %v)
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul <8 x float> %v, %is
|
||||
%v_is_is = fmul <8 x float> %v_is, %is
|
||||
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3., float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%is_mul = fmul <8 x float> %is, %three_sub
|
||||
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
ret <8 x float> %half_scale
|
||||
}
|
||||
|
||||
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||
%v = insertelement <8 x float> undef, float %0, i32 0
|
||||
%vis = call <8 x float> @llvm.x86.avx.rsqrt.ss(<8 x float> %v)
|
||||
%is = extractelement <8 x float> %vis, i32 0
|
||||
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul float %0, %is
|
||||
%v_is_is = fmul float %v_is, %is
|
||||
%three_sub = fsub float 3., %v_is_is
|
||||
%is_mul = fmul float %is, %three_sub
|
||||
%half_scale = fmul float 0.5, %is_mul
|
||||
ret float %half_scale
|
||||
}
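The corresponding Newton-Raphson step for the reciprocal square root estimate, again in illustrative C form:

    // is is the hardware estimate of 1/sqrt(v).
    float refined_rsqrt(float v, float is) {
        return 0.5f * is * (3.0f - (v * is) * is);
    }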
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.sqrt.ps(<8 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.sqrt.ss(<8 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.sqrt.ps(<8 x float> %0)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 8, float, @llvm.x86.avx.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fastmath
|
||||
|
||||
declare void @llvm.x86.avx.stmxcsr(i32 *) nounwind
|
||||
declare void @llvm.x86.avx.ldmxcsr(i32 *) nounwind
|
||||
|
||||
define internal void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
call void @llvm.x86.avx.stmxcsr(i32 * %ptr)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.avx.ldmxcsr(i32 * %ptr)
|
||||
ret void
|
||||
}
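What the bit manipulation above does, in illustrative C form: it sets the DAZ (denormals-are-zero, bit 6 = 64) and FTZ (flush-to-zero, bit 15 = 32768) bits of the MXCSR control register, hence the 32832 constant:

    unsigned int with_fast_denormals(unsigned int mxcsr) {
        return mxcsr | (1u << 6) | (1u << 15);  // == mxcsr | 32832
    }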
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; svml
|
||||
|
||||
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
|
||||
; or, use the macro to call the 4-wide ones twice with our 8-wide
|
||||
; vectors...
|
||||
|
||||
declare <8 x float> @__svml_sin(<8 x float>)
|
||||
declare <8 x float> @__svml_cos(<8 x float>)
|
||||
declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
|
||||
declare <8 x float> @__svml_tan(<8 x float>)
|
||||
declare <8 x float> @__svml_atan(<8 x float>)
|
||||
declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
|
||||
declare <8 x float> @__svml_exp(<8 x float>)
|
||||
declare <8 x float> @__svml_log(<8 x float>)
|
||||
declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.max.ps(<8 x float>, <8 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.max.ss(<8 x float>, <8 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.min.ps(<8 x float>, <8 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.min.ss(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__max_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.max.ps(<8 x float> %0, <8 x float> %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, float, @llvm.x86.avx.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__min_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.min.ps(<8 x float> %0, <8 x float> %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, float, @llvm.x86.avx.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx.pminsd(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
declare <8 x i32> @llvm.x86.avx.pmaxsd(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
|
||||
define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x i32> @llvm.x86.avx.pminsd(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.pminsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x i32> @llvm.x86.avx.pmaxsd(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.pmaxsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx.pminud(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
declare <8 x i32> @llvm.x86.avx.pmaxud(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
|
||||
define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x i32> @llvm.x86.avx.pminud(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.pminud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x i32> @llvm.x86.avx.pmaxud(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.pmaxud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps(<8 x float>) nounwind readnone
|
||||
|
||||
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%v = call i32 @llvm.x86.avx.movmsk.ps(<8 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal float ops
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.hadd.ps(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps(<8 x float> %0, <8 x float> %0)
|
||||
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps(<8 x float> %v1, <8 x float> %v1)
|
||||
%scalar1 = extractelement <8 x float> %v2, i32 0
|
||||
%scalar2 = extractelement <8 x float> %v2, i32 4
|
||||
%sum = fadd float %scalar1, %scalar2
|
||||
ret float %sum
|
||||
}
|
||||
|
||||
|
||||
define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
|
||||
define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int32 ops
|
||||
|
||||
define internal <8 x i32> @__add_varying_int32(<8 x i32>,
|
||||
<8 x i32>) nounwind readnone alwaysinline {
|
||||
%s = add <8 x i32> %0, %1
|
||||
ret <8 x i32> %s
|
||||
}
|
||||
|
||||
define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
%s = add i32 %0, %1
|
||||
ret i32 %s
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint32 ops
|
||||
|
||||
define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
%mm = call i32 @__movmsk(<8 x i32> %mask)
|
||||
%any_on = icmp ne i32 %mm, 0
|
||||
br i1 %any_on, label %load, label %skip
|
||||
|
||||
load:
|
||||
%ptr = bitcast i8 * %0 to i32 *
|
||||
%val = load i32 * %ptr
|
||||
|
||||
%ret0 = insertelement <8 x i32> undef, i32 %val, i32 0
|
||||
%ret1 = insertelement <8 x i32> %ret0, i32 %val, i32 1
|
||||
%ret2 = insertelement <8 x i32> %ret1, i32 %val, i32 2
|
||||
%ret3 = insertelement <8 x i32> %ret2, i32 %val, i32 3
|
||||
%ret4 = insertelement <8 x i32> %ret3, i32 %val, i32 4
|
||||
%ret5 = insertelement <8 x i32> %ret4, i32 %val, i32 5
|
||||
%ret6 = insertelement <8 x i32> %ret5, i32 %val, i32 6
|
||||
%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
|
||||
ret <8 x i32> %ret7
|
||||
|
||||
skip:
|
||||
ret <8 x i32> undef
|
||||
}
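;; The pattern above checks __movmsk first so that no memory is touched when
;; every lane is off; otherwise it loads a single scalar and replicates it
;; across all 8 lanes with a chain of insertelement instructions.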
|
||||
|
||||
|
||||
define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
%mm = call i32 @__movmsk(<8 x i32> %mask)
|
||||
%any_on = icmp ne i32 %mm, 0
|
||||
br i1 %any_on, label %load, label %skip
|
||||
|
||||
load:
|
||||
%ptr = bitcast i8 * %0 to i64 *
|
||||
%val = load i64 * %ptr
|
||||
|
||||
%ret0 = insertelement <8 x i64> undef, i64 %val, i32 0
|
||||
%ret1 = insertelement <8 x i64> %ret0, i64 %val, i32 1
|
||||
%ret2 = insertelement <8 x i64> %ret1, i64 %val, i32 2
|
||||
%ret3 = insertelement <8 x i64> %ret2, i64 %val, i32 3
|
||||
%ret4 = insertelement <8 x i64> %ret3, i64 %val, i32 4
|
||||
%ret5 = insertelement <8 x i64> %ret4, i64 %val, i32 5
|
||||
%ret6 = insertelement <8 x i64> %ret5, i64 %val, i32 6
|
||||
%ret7 = insertelement <8 x i64> %ret6, i64 %val, i32 7
|
||||
ret <8 x i64> %ret7
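  ;; (return the fully-populated vector, not the partially-filled %ret3)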
|
||||
|
||||
skip:
|
||||
ret <8 x i64> undef
|
||||
}
|
||||
|
||||
|
||||
define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
%mm = call i32 @__movmsk(<8 x i32> %mask)
|
||||
%any_on = icmp ne i32 %mm, 0
|
||||
br i1 %any_on, label %load, label %skip
|
||||
|
||||
load:
|
||||
%ptr = bitcast i8 * %0 to <8 x i32> *
|
||||
%val = load <8 x i32> * %ptr, align 4
|
||||
ret <8 x i32> %val
|
||||
|
||||
skip:
|
||||
ret <8 x i32> undef
|
||||
}
|
||||
|
||||
|
||||
define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||
%mm = call i32 @__movmsk(<8 x i32> %mask)
|
||||
%any_on = icmp ne i32 %mm, 0
|
||||
br i1 %any_on, label %load, label %skip
|
||||
|
||||
load:
|
||||
%ptr = bitcast i8 * %0 to <8 x i64> *
|
||||
%val = load <8 x i64> * %ptr, align 8
|
||||
ret <8 x i64> %val
|
||||
|
||||
skip:
|
||||
ret <8 x i64> undef
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
per_lane(8, <8 x i32> %2, `
|
||||
; compute address for this one
|
||||
%ptr_ID = getelementptr <8 x i32> * %0, i32 0, i32 LANE
|
||||
%storeval_ID = extractelement <8 x i32> %1, i32 LANE
|
||||
store i32 %storeval_ID, i32 * %ptr_ID')
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
per_lane(8, <8 x i32> %2, `
|
||||
%ptr_ID = getelementptr <8 x i64> * %0, i32 0, i32 LANE
|
||||
%storeval_ID = extractelement <8 x i64> %1, i32 LANE
|
||||
store i64 %storeval_ID, i64 * %ptr_ID')
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.blendvps(<8 x float>, <8 x float>,
|
||||
<8 x float>) nounwind readnone
|
||||
|
||||
|
||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
|
||||
%oldValue = load <8 x i32>* %0, align 4
|
||||
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
|
||||
%newAsFloat = bitcast <8 x i32> %1 to <8 x float>
|
||||
%blend = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %oldAsFloat,
|
||||
<8 x float> %newAsFloat,
|
||||
<8 x float> %mask_as_float)
|
||||
%blendAsInt = bitcast <8 x float> %blend to <8 x i32>
|
||||
store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
|
||||
ret void
|
||||
}
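;; Here blendvps selects, per lane, the new value where the mask's sign bit is
;; set and the old value where it is clear, so storing the blended vector back
;; leaves memory for inactive lanes unchanged.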
|
||||
|
||||
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture, <8 x i64>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
; always just serialize it
|
||||
; FIXME: should implement the "do two 32-bit masked stores" stuff that
|
||||
; other targets do...
|
||||
call void @__masked_store_64(<8 x i64>* nocapture %0, <8 x i64> %1, <8 x i32> %2)
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
gen_scatter(8, i32)
|
||||
gen_scatter(8, i64)
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.sqrt.pd(<4 x double>) nounwind readnone
|
||||
declare <4 x double> @llvm.x86.avx.sqrt.sd(<4 x double>) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd, %0)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 4, double, @llvm.x86.avx.sqrt.pd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.max.pd(<4 x double>, <4 x double>) nounwind readnone
|
||||
declare <4 x double> @llvm.x86.avx.max.sd(<4 x double>, <4 x double>) nounwind readnone
|
||||
declare <4 x double> @llvm.x86.avx.min.pd(<4 x double>, <4 x double>) nounwind readnone
|
||||
declare <4 x double> @llvm.x86.avx.min.sd(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, double, @llvm.x86.avx.min.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 4, double, @llvm.x86.avx.min.pd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, double, @llvm.x86.avx.max.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 4, double, @llvm.x86.avx.max.pd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
stdlib.ispc: 1209 lines changed (diff suppressed because it is too large)
stdlib.m4: 835 lines changed
@@ -1,835 +0,0 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;; This file provides a variety of macros used to generate LLVM bitcode
|
||||
;; parametrized in various ways. Implementations of the standard library
|
||||
;; builtins for various targets can use macros from this file to simplify
|
||||
;; generating code for their implementations of those builtins.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
|
||||
;; Helper macro for calling various SSE instructions for scalar values
|
||||
;; but where the instruction takes a vector parameter.
|
||||
;; $1 : name of variable to put the final value in
|
||||
;; $2 : vector width of the target
|
||||
;; $3 : scalar type of the operand
|
||||
;; $4 : SSE intrinsic name
|
||||
;; $5 : variable name that has the scalar value
|
||||
;; For example, the following call causes the variable %ret to have
|
||||
;; the result of a call to sqrtss with the scalar value in %0
|
||||
;; sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
|
||||
define(`sse_unary_scalar', `
|
||||
%$1_vec = insertelement <$2 x $3> undef, $3 $5, i32 0
|
||||
%$1_val = call <$2 x $3> $4(<$2 x $3> %$1_vec)
|
||||
%$1 = extractelement <$2 x $3> %$1_val, i32 0
|
||||
')
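;; As an illustrative sketch, the example call above expands to roughly:
;;   %ret_vec = insertelement <4 x float> undef, float %0, i32 0
;;   %ret_val = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ret_vec)
;;   %ret = extractelement <4 x float> %ret_val, i32 0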
|
||||
|
||||
;; Similar to `sse_unary_scalar', this helper macro is for calling binary
|
||||
;; SSE instructions with scalar values,
|
||||
;; $1: name of variable to put the result in
|
||||
;; $2: vector width of the target
|
||||
;; $3: scalar type of the operand
|
||||
;; $4 : SSE intrinsic name
|
||||
;; $5 : variable name that has the first scalar operand
|
||||
;; $6 : variable name that has the second scalar operand
|
||||
|
||||
define(`sse_binary_scalar', `
|
||||
%$1_veca = insertelement <$2 x $3> undef, $3 $5, i32 0
|
||||
%$1_vecb = insertelement <$2 x $3> undef, $3 $6, i32 0
|
||||
%$1_val = call <$2 x $3> $4(<$2 x $3> %$1_veca, <$2 x $3> %$1_vecb)
|
||||
%$1 = extractelement <$2 x $3> %$1_val, i32 0
|
||||
')
|
||||
|
||||
;; Do a reduction over a 4-wide vector
|
||||
;; $1: type of final scalar result
|
||||
;; $2: 4-wide function that takes 2 4-wide operands and returns the
|
||||
;; element-wise reduction
|
||||
;; $3: scalar function that takes two scalar operands and returns
|
||||
;; the final reduction
|
||||
|
||||
define(`reduce4', `
|
||||
%v1 = shufflevector <4 x $1> %0, <4 x $1> undef,
|
||||
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||
%m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %0)
|
||||
%m1a = extractelement <4 x $1> %m1, i32 0
|
||||
%m1b = extractelement <4 x $1> %m1, i32 1
|
||||
%m = call $1 $3($1 %m1a, $1 %m1b)
|
||||
ret $1 %m
|
||||
'
|
||||
)
|
||||
|
||||
;; Similar to `reduce4', do a reduction over an 8-wide vector
|
||||
;; $1: type of final scalar result
|
||||
;; $2: 8-wide function that takes 2 8-wide operands and returns the
|
||||
;; element-wise reduction
|
||||
;; $3: scalar function that takes two scalar operands and returns
|
||||
;; the final reduction
|
||||
|
||||
define(`reduce8', `
|
||||
%v1 = shufflevector <8 x $1> %0, <8 x $1> undef,
|
||||
<8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%m1 = call <8 x $1> $2(<8 x $1> %v1, <8 x $1> %0)
|
||||
%v2 = shufflevector <8 x $1> %m1, <8 x $1> undef,
|
||||
<8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%m2 = call <8 x $1> $2(<8 x $1> %v2, <8 x $1> %m1)
|
||||
%m2a = extractelement <8 x $1> %m2, i32 0
|
||||
%m2b = extractelement <8 x $1> %m2, i32 1
|
||||
%m = call $1 $3($1 %m2a, $1 %m2b)
|
||||
ret $1 %m
|
||||
'
|
||||
)
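;; Sketch of the data flow in reduce8: the first shufflevector moves lanes 4..7
;; down, the first call to $2 combines them element-wise with lanes 0..3, the
;; second shuffle/call pair folds the four partial results down to two, and the
;; final scalar call to $3 combines elements 0 and 1 into the result.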
|
||||
|
||||
;; Do a reduction over an 8-wide vector, using a vector reduction function
|
||||
;; that only takes 4-wide vectors
|
||||
;; $1: type of final scalar result
|
||||
;; $2: 4-wide function that takes 2 4-wide operands and returns the
|
||||
;; element-wise reduction
|
||||
;; $3: scalar function that takes two scalar operands and returns
|
||||
;; the final reduction
|
||||
|
||||
define(`reduce8by4', `
|
||||
%v1 = shufflevector <8 x $1> %0, <8 x $1> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v2 = shufflevector <8 x $1> %0, <8 x $1> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2)
|
||||
%v3 = shufflevector <4 x $1> %m1, <4 x $1> undef,
|
||||
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||
%m2 = call <4 x $1> $2(<4 x $1> %v3, <4 x $1> %m1)
|
||||
%m2a = extractelement <4 x $1> %m2, i32 0
|
||||
%m2b = extractelement <4 x $1> %m2, i32 1
|
||||
%m = call $1 $3($1 %m2a, $1 %m2b)
|
||||
ret $1 %m
|
||||
'
|
||||
)
|
||||
|
||||
|
||||
;; Given a unary function that takes a 2-wide vector and a 4-wide vector
|
||||
;; that we'd like to apply it to, extract 2 2-wide vectors from the 4-wide
|
||||
;; vector, apply it, and return the corresponding 4-wide vector result
|
||||
;; $1: name of variable into which the final result should go
|
||||
;; $2: scalar type of the vector elements
|
||||
;; $3: 2-wide unary vector function to apply
|
||||
;; $4: 4-wide operand value
|
||||
|
||||
define(`unary2to4', `
|
||||
%$1_0 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> <i32 0, i32 1>
|
||||
%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0)
|
||||
%$1_1 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> <i32 2, i32 3>
|
||||
%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1)
|
||||
%$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
'
|
||||
)
|
||||
|
||||
;; Similar to `unary2to4', this applies a 2-wide binary function to two 4-wide
|
||||
;; vector operands
|
||||
;; $1: name of variable into which the final result should go
|
||||
;; $2: scalar type of the vector elements
|
||||
;; $3: 2-wide binary vector function to apply
|
||||
;; $4: First 4-wide operand value
|
||||
;; $5: Second 4-wide operand value
|
||||
|
||||
define(`binary2to4', `
|
||||
%$1_0a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> <i32 0, i32 1>
|
||||
%$1_0b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32> <i32 0, i32 1>
|
||||
%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b)
|
||||
%$1_1a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> <i32 2, i32 3>
|
||||
%$1_1b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32> <i32 2, i32 3>
|
||||
%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b)
|
||||
%$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
'
|
||||
)
|
||||
|
||||
;; Similar to `unary2to4', this maps a 4-wide unary function to an 8-wide
|
||||
;; vector operand
|
||||
;; $1: name of variable into which the final result should go
|
||||
;; $2: scalar type of the vector elements
|
||||
;; $3: 4-wide unary vector function to apply
|
||||
;; $4: 8-wide operand value
|
||||
|
||||
define(`unary4to8', `
|
||||
%$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0)
|
||||
%$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1)
|
||||
%$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
'
|
||||
)
|
||||
|
||||
;; And along the lines of `binary2to4', this maps a 4-wide binary function to
|
||||
;; two 8-wide vector operands
|
||||
;; $1: name of variable into which the final result should go
|
||||
;; $2: scalar type of the vector elements
|
||||
;; $3: 4-wide binary vector function to apply
|
||||
;; $4: First 8-wide operand value
|
||||
;; $5: Second 8-wide operand value
|
||||
|
||||
define(`binary4to8', `
|
||||
%$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b)
|
||||
%$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b)
|
||||
%$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
'
|
||||
)
|
||||
|
||||
|
||||
;; Maps a 2-wide unary function to an 8-wide vector operand, returning an
|
||||
;; 8-wide vector result
|
||||
;; $1: name of variable into which the final result should go
|
||||
;; $2: scalar type of the vector elements
|
||||
;; $3: 2-wide unary vector function to apply
|
||||
;; $4: 8-wide operand value
|
||||
|
||||
define(`unary2to8', `
|
||||
%$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 0, i32 1>
|
||||
%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0)
|
||||
%$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 2, i32 3>
|
||||
%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1)
|
||||
%$1_2 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 4, i32 5>
|
||||
%v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2)
|
||||
%$1_3 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 6, i32 7>
|
||||
%v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3)
|
||||
%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
'
|
||||
)
|
||||
|
||||
;; Maps a 2-wide binary function to two 8-wide vector operands
|
||||
;; $1: name of variable into which the final result should go
|
||||
;; $2: scalar type of the vector elements
|
||||
;; $3: 2-wide binary vector function to apply
|
||||
;; $4: First 8-wide operand value
|
||||
;; $5: Second 8-wide operand value
|
||||
|
||||
define(`binary2to8', `
|
||||
%$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 0, i32 1>
|
||||
%$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> <i32 0, i32 1>
|
||||
%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b)
|
||||
%$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 2, i32 3>
|
||||
%$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> <i32 2, i32 3>
|
||||
%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b)
|
||||
%$1_2a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 4, i32 5>
|
||||
%$1_2b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> <i32 4, i32 5>
|
||||
%v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b)
|
||||
%$1_3a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 6, i32 7>
|
||||
%$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> <i32 6, i32 7>
|
||||
%v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b)
|
||||
|
||||
%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
'
|
||||
)
|
||||
|
||||
;; The unary SSE round intrinsic takes a second argument that encodes the
|
||||
;; rounding mode. This macro makes it easier to apply the 4-wide roundps
|
||||
;; to 8-wide vector operands
|
||||
;; $1: value to be rounded
|
||||
;; $2: integer encoding of rounding mode
|
||||
;; FIXME: this just has a ret statement at the end to return the result,
|
||||
;; which is inconsistent with the macros above
|
||||
|
||||
define(`round4to8', `
|
||||
%v0 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v1 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2)
|
||||
%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2)
|
||||
%ret = shufflevector <4 x float> %r0, <4 x float> %r1,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x float> %ret
|
||||
'
|
||||
)
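;; Usage sketch (the actual call sites live in the per-target files): a
;; target's rounding builtins would invoke this macro along the lines of
;;   round4to8(%0, 8)    ;; round to nearest, suppressing precision exceptions
;;   round4to8(%0, 9)    ;; floor
;;   round4to8(%0, 10)   ;; ceil
;; where the immediate uses the SSE4.1 roundps encoding.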
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; stdlib_core
|
||||
;;
|
||||
;; This macro defines a bunch of helper routines that only depend on the
|
||||
;; target's vector width, which it takes as its first parameter.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
define(`stdlib_core', `
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; vector ops
|
||||
|
||||
define internal float @__extract(<$1 x float>, i32) nounwind readnone alwaysinline {
|
||||
%extract = extractelement <$1 x float> %0, i32 %1
|
||||
ret float %extract
|
||||
}
|
||||
|
||||
define internal <$1 x float> @__insert(<$1 x float>, i32,
|
||||
float) nounwind readnone alwaysinline {
|
||||
%insert = insertelement <$1 x float> %0, float %2, i32 %1
|
||||
ret <$1 x float> %insert
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; various bitcasts from one type to another
|
||||
|
||||
define internal <$1 x i32> @__intbits_varying_float(<$1 x float>) nounwind readnone alwaysinline {
|
||||
%float_to_int_bitcast = bitcast <$1 x float> %0 to <$1 x i32>
|
||||
ret <$1 x i32> %float_to_int_bitcast
|
||||
}
|
||||
|
||||
define internal i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline {
|
||||
%float_to_int_bitcast = bitcast float %0 to i32
|
||||
ret i32 %float_to_int_bitcast
|
||||
}
|
||||
|
||||
define internal <$1 x i64> @__intbits_varying_double(<$1 x double>) nounwind readnone alwaysinline {
|
||||
%double_to_int_bitcast = bitcast <$1 x double> %0 to <$1 x i64>
|
||||
ret <$1 x i64> %double_to_int_bitcast
|
||||
}
|
||||
|
||||
define internal i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline {
|
||||
%double_to_int_bitcast = bitcast double %0 to i64
|
||||
ret i64 %double_to_int_bitcast
|
||||
}
|
||||
|
||||
define internal <$1 x float> @__floatbits_varying_int32(<$1 x i32>) nounwind readnone alwaysinline {
|
||||
%int_to_float_bitcast = bitcast <$1 x i32> %0 to <$1 x float>
|
||||
ret <$1 x float> %int_to_float_bitcast
|
||||
}
|
||||
|
||||
define internal float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline {
|
||||
%int_to_float_bitcast = bitcast i32 %0 to float
|
||||
ret float %int_to_float_bitcast
|
||||
}
|
||||
|
||||
define internal <$1 x double> @__doublebits_varying_int64(<$1 x i64>) nounwind readnone alwaysinline {
|
||||
%int_to_double_bitcast = bitcast <$1 x i64> %0 to <$1 x double>
|
||||
ret <$1 x double> %int_to_double_bitcast
|
||||
}
|
||||
|
||||
define internal double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline {
|
||||
%int_to_double_bitcast = bitcast i64 %0 to double
|
||||
ret double %int_to_double_bitcast
|
||||
}
|
||||
|
||||
define internal <$1 x float> @__undef_varying() nounwind readnone alwaysinline {
|
||||
ret <$1 x float> undef
|
||||
}
|
||||
|
||||
define internal float @__undef_uniform() nounwind readnone alwaysinline {
|
||||
ret float undef
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; stdlib transcendentals
|
||||
;;
|
||||
;; These functions provide entrypoints that call out to the libm
|
||||
;; implementations of the transcendental functions
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
declare float @sinf(float) nounwind readnone
|
||||
declare float @cosf(float) nounwind readnone
|
||||
declare void @sincosf(float, float *, float *) nounwind readnone
|
||||
declare float @tanf(float) nounwind readnone
|
||||
declare float @atanf(float) nounwind readnone
|
||||
declare float @atan2f(float, float) nounwind readnone
|
||||
declare float @expf(float) nounwind readnone
|
||||
declare float @logf(float) nounwind readnone
|
||||
declare float @powf(float, float) nounwind readnone
|
||||
|
||||
define internal float @__stdlib_sin(float) nounwind readnone alwaysinline {
|
||||
%r = call float @sinf(float %0)
|
||||
ret float %r
|
||||
}
|
||||
|
||||
define internal float @__stdlib_cos(float) nounwind readnone alwaysinline {
|
||||
%r = call float @cosf(float %0)
|
||||
ret float %r
|
||||
}
|
||||
|
||||
define internal void @__stdlib_sincos(float, float *, float *) nounwind readnone alwaysinline {
|
||||
call void @sincosf(float %0, float *%1, float *%2)
|
||||
ret void
|
||||
}
|
||||
|
||||
define internal float @__stdlib_tan(float) nounwind readnone alwaysinline {
|
||||
%r = call float @tanf(float %0)
|
||||
ret float %r
|
||||
}
|
||||
|
||||
define internal float @__stdlib_atan(float) nounwind readnone alwaysinline {
|
||||
%r = call float @atanf(float %0)
|
||||
ret float %r
|
||||
}
|
||||
|
||||
define internal float @__stdlib_atan2(float, float) nounwind readnone alwaysinline {
|
||||
%r = call float @atan2f(float %0, float %1)
|
||||
ret float %r
|
||||
}
|
||||
|
||||
define internal float @__stdlib_log(float) nounwind readnone alwaysinline {
|
||||
%r = call float @logf(float %0)
|
||||
ret float %r
|
||||
}
|
||||
|
||||
define internal float @__stdlib_exp(float) nounwind readnone alwaysinline {
|
||||
%r = call float @expf(float %0)
|
||||
ret float %r
|
||||
}
|
||||
|
||||
define internal float @__stdlib_pow(float, float) nounwind readnone alwaysinline {
|
||||
%r = call float @powf(float %0, float %1)
|
||||
ret float %r
|
||||
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Definitions of 8 and 16-bit load and store functions
|
||||
;;
|
||||
;; The `int8_16' macro defines functions related to loading and storing 8 and
|
||||
;; 16-bit values in memory, converting to and from i32. (This is a workaround
|
||||
;; to be able to use in-memory values of these types in ispc programs, since the
|
||||
;; compiler doesn't yet support 8 and 16-bit datatypes...)
|
||||
;;
|
||||
;; Arguments to pass to `int8_16':
|
||||
;; $1: vector width of the target
|
||||
|
||||
define(`int8_16', `
|
||||
define internal <$1 x i32> @__load_uint8([0 x i32] *, i32 %offset) nounwind alwaysinline {
|
||||
%ptr8 = bitcast [0 x i32] *%0 to i8 *
|
||||
%ptr = getelementptr i8 * %ptr8, i32 %offset
|
||||
%ptr64 = bitcast i8 * %ptr to i`'eval(8*$1) *
|
||||
%val = load i`'eval(8*$1) * %ptr64, align 1
|
||||
|
||||
%vval = bitcast i`'eval(8*$1) %val to <$1 x i8>
|
||||
; we're assuming unsigned, so zero-extend to i32...
|
||||
%ret = zext <$1 x i8> %vval to <$1 x i32>
|
||||
ret <$1 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset) nounwind alwaysinline {
|
||||
%ptr16 = bitcast [0 x i32] *%0 to i16 *
|
||||
%ptr = getelementptr i16 * %ptr16, i32 %offset
|
||||
%ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
|
||||
%val = load i`'eval(16*$1) * %ptr64, align 2
|
||||
|
||||
%vval = bitcast i`'eval(16*$1) %val to <$1 x i16>
|
||||
; unsigned, so zero-extend...
|
||||
%ret = zext <$1 x i16> %vval to <$1 x i32>
|
||||
ret <$1 x i32> %ret
|
||||
}
|
||||
|
||||
define internal void @__store_uint8([0 x i32] *, i32 %offset, <$1 x i32> %val32,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = trunc <$1 x i32> %val32 to <$1 x i8>
|
||||
%val64 = bitcast <$1 x i8> %val to i`'eval(8*$1)
|
||||
|
||||
%mask8 = trunc <$1 x i32> %mask to <$1 x i8>
|
||||
%mask64 = bitcast <$1 x i8> %mask8 to i`'eval(8*$1)
|
||||
%notmask = xor i`'eval(8*$1) %mask64, -1
|
||||
|
||||
%ptr8 = bitcast [0 x i32] *%0 to i8 *
|
||||
%ptr = getelementptr i8 * %ptr8, i32 %offset
|
||||
%ptr64 = bitcast i8 * %ptr to i`'eval(8*$1) *
|
||||
|
||||
;; load the old value, use logical ops to blend based on the mask, then
|
||||
;; store the result back
|
||||
%old = load i`'eval(8*$1) * %ptr64, align 1
|
||||
%oldmasked = and i`'eval(8*$1) %old, %notmask
|
||||
%newmasked = and i`'eval(8*$1) %val64, %mask64
|
||||
%final = or i`'eval(8*$1) %oldmasked, %newmasked
|
||||
store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64, align 1
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = trunc <$1 x i32> %val32 to <$1 x i16>
|
||||
%val64 = bitcast <$1 x i16> %val to i`'eval(16*$1)
|
||||
|
||||
%mask8 = trunc <$1 x i32> %mask to <$1 x i16>
|
||||
%mask64 = bitcast <$1 x i16> %mask8 to i`'eval(16*$1)
|
||||
%notmask = xor i`'eval(16*$1) %mask64, -1
|
||||
|
||||
%ptr16 = bitcast [0 x i32] *%0 to i16 *
|
||||
%ptr = getelementptr i16 * %ptr16, i32 %offset
|
||||
%ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
|
||||
|
||||
;; as above, use mask to do blending with logical ops...
|
||||
%old = load i`'eval(16*$1) * %ptr64, align 2
|
||||
%oldmasked = and i`'eval(16*$1) %old, %notmask
|
||||
%newmasked = and i`'eval(16*$1) %val64, %mask64
|
||||
%final = or i`'eval(16*$1) %oldmasked, %newmasked
|
||||
store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64, align 2
|
||||
|
||||
ret void
|
||||
}
|
||||
'
|
||||
)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; packed load and store functions
|
||||
;;
|
||||
;; These define functions to emulate those nice packed load and packed store
|
||||
;; instructions. For packed store, given a pointer to destination array and
|
||||
;; an offset into the array, for each lane where the mask is on, the
|
||||
;; corresponding value for that lane is stored into packed locations in the
|
||||
;; destination array. For packed load, each lane that has an active mask
|
||||
;; loads a sequential value from the array.
|
||||
;;
|
||||
;; $1: vector width of the target
|
||||
;;
|
||||
;; FIXME: use the per_lane macro, defined below, to implement these!
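;; For example (purely illustrative values): with an 8-wide mask whose active
;; lanes are 0, 2, and 3 and vals = {10,11,12,13,...}, __packed_store_active
;; stores 10, 12, 13 to consecutive locations starting at %start_offset and
;; returns 3, the number of active lanes written.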
|
||||
|
||||
define(`packed_load_and_store', `
|
||||
declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
|
||||
|
||||
define i32 @__packed_load_active([0 x i32] *, i32 %start_offset, <$1 x i32> * %val_ptr,
|
||||
<$1 x i32> %full_mask) nounwind alwaysinline {
|
||||
entry:
|
||||
%mask = call i32 @__movmsk(<$1 x i32> %full_mask)
|
||||
%baseptr = bitcast [0 x i32] * %0 to i32 *
|
||||
%startptr = getelementptr i32 * %baseptr, i32 %start_offset
|
||||
%mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
|
||||
br i1 %mask_known, label %known_mask, label %unknown_mask
|
||||
|
||||
known_mask:
|
||||
%allon = icmp eq i32 %mask, eval((1 << $1) -1)
|
||||
br i1 %allon, label %all_on, label %not_all_on
|
||||
|
||||
all_on:
|
||||
;; everyone wants to load, so just load an entire vector width in a single
|
||||
;; vector load
|
||||
%vecptr = bitcast i32 *%startptr to <$1 x i32> *
|
||||
%vec_load = load <$1 x i32> *%vecptr, align 4
|
||||
store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
|
||||
ret i32 $1
|
||||
|
||||
not_all_on:
|
||||
%alloff = icmp eq i32 %mask, 0
|
||||
br i1 %alloff, label %all_off, label %unknown_mask
|
||||
|
||||
all_off:
|
||||
;; no one wants to load
|
||||
ret i32 0
|
||||
|
||||
unknown_mask:
|
||||
br label %loop
|
||||
|
||||
loop:
|
||||
%lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
|
||||
%lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
|
||||
%offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]
|
||||
|
||||
; is the current lane on?
|
||||
%and = and i32 %mask, %lanemask
|
||||
%do_load = icmp eq i32 %and, %lanemask
|
||||
br i1 %do_load, label %load, label %loopend
|
||||
|
||||
load:
|
||||
%loadptr = getelementptr i32 *%startptr, i32 %offset
|
||||
%loadval = load i32 *%loadptr
|
||||
%val_ptr_i32 = bitcast <$1 x i32> * %val_ptr to i32 *
|
||||
%storeptr = getelementptr i32 *%val_ptr_i32, i32 %lane
|
||||
store i32 %loadval, i32 *%storeptr
|
||||
%offset1 = add i32 %offset, 1
|
||||
br label %loopend
|
||||
|
||||
loopend:
|
||||
%nextoffset = phi i32 [ %offset1, %load ], [ %offset, %loop ]
|
||||
%nextlane = add i32 %lane, 1
|
||||
%nextlanemask = mul i32 %lanemask, 2
|
||||
|
||||
; are we done yet?
|
||||
%test = icmp ne i32 %nextlane, $1
|
||||
br i1 %test, label %loop, label %done
|
||||
|
||||
done:
|
||||
ret i32 %nextoffset
|
||||
}
|
||||
|
||||
define i32 @__packed_store_active([0 x i32] *, i32 %start_offset, <$1 x i32> %vals,
|
||||
<$1 x i32> %full_mask) nounwind alwaysinline {
|
||||
entry:
|
||||
%mask = call i32 @__movmsk(<$1 x i32> %full_mask)
|
||||
%baseptr = bitcast [0 x i32] * %0 to i32 *
|
||||
%startptr = getelementptr i32 * %baseptr, i32 %start_offset
|
||||
%mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
|
||||
br i1 %mask_known, label %known_mask, label %unknown_mask
|
||||
|
||||
known_mask:
|
||||
%allon = icmp eq i32 %mask, eval((1 << $1) -1)
|
||||
br i1 %allon, label %all_on, label %not_all_on
|
||||
|
||||
all_on:
|
||||
%vecptr = bitcast i32 *%startptr to <$1 x i32> *
|
||||
store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4
|
||||
ret i32 $1
|
||||
|
||||
not_all_on:
|
||||
%alloff = icmp eq i32 %mask, 0
|
||||
br i1 %alloff, label %all_off, label %unknown_mask
|
||||
|
||||
all_off:
|
||||
ret i32 0
|
||||
|
||||
unknown_mask:
|
||||
br label %loop
|
||||
|
||||
loop:
|
||||
%lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
|
||||
%lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
|
||||
%offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]
|
||||
|
||||
; is the current lane on?
|
||||
%and = and i32 %mask, %lanemask
|
||||
%do_store = icmp eq i32 %and, %lanemask
|
||||
br i1 %do_store, label %store, label %loopend
|
||||
|
||||
store:
|
||||
%storeval = extractelement <$1 x i32> %vals, i32 %lane
|
||||
%storeptr = getelementptr i32 *%startptr, i32 %offset
|
||||
store i32 %storeval, i32 *%storeptr
|
||||
%offset1 = add i32 %offset, 1
|
||||
br label %loopend
|
||||
|
||||
loopend:
|
||||
%nextoffset = phi i32 [ %offset1, %store ], [ %offset, %loop ]
|
||||
%nextlane = add i32 %lane, 1
|
||||
%nextlanemask = mul i32 %lanemask, 2
|
||||
|
||||
; are we done yet?
|
||||
%test = icmp ne i32 %nextlane, $1
|
||||
br i1 %test, label %loop, label %done
|
||||
|
||||
done:
|
||||
ret i32 %nextoffset
|
||||
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; per_lane
|
||||
;;
|
||||
;; The scary macro below encapsulates the 'scalarization' idiom--i.e. we have
|
||||
;; some operation that we'd like to perform only for the lanes where the
|
||||
;; mask is on
|
||||
;; $1: vector width of the target
|
||||
;; $2: variable that holds the mask
|
||||
;; $3: block of code to run for each lane that is on
|
||||
;; Inside this code, any instances of the text "LANE" are replaced
|
||||
;; with an i32 value that represents the current lane number
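;; A representative caller is the 8-wide AVX target's __masked_store_32(),
;; shown above:
;;   per_lane(8, <8 x i32> %2, `
;;     %ptr_ID = getelementptr <8 x i32> * %0, i32 0, i32 LANE
;;     %storeval_ID = extractelement <8 x i32> %1, i32 LANE
;;     store i32 %storeval_ID, i32 * %ptr_ID')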
|
||||
|
||||
divert(`-1')
|
||||
# forloop(var, from, to, stmt) - improved version:
|
||||
# works even if VAR is not a strict macro name
|
||||
# performs a sanity check that FROM is not larger than TO
|
||||
# allows complex numerical expressions in TO and FROM
|
||||
define(`forloop', `ifelse(eval(`($3) >= ($2)'), `1',
|
||||
`pushdef(`$1', eval(`$2'))_$0(`$1',
|
||||
eval(`$3'), `$4')popdef(`$1')')')
|
||||
define(`_forloop',
|
||||
`$3`'ifelse(indir(`$1'), `$2', `',
|
||||
`define(`$1', incr(indir(`$1')))$0($@)')')
|
||||
divert`'dnl
|
||||
|
||||
; num lanes, mask, code block to do per lane
|
||||
define(`per_lane', `
|
||||
br label %pl_entry
|
||||
|
||||
pl_entry:
|
||||
%pl_mask = call i32 @__movmsk($2)
|
||||
%pl_mask_known = call i1 @__is_compile_time_constant_mask($2)
|
||||
br i1 %pl_mask_known, label %pl_known_mask, label %pl_unknown_mask
|
||||
|
||||
pl_known_mask:
|
||||
;; the mask is known at compile time; see if it is something we can
|
||||
;; handle more efficiently
|
||||
%pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
|
||||
br i1 %pl_is_allon, label %pl_all_on, label %pl_not_all_on
|
||||
|
||||
pl_all_on:
|
||||
;; the mask is all on--just expand the code for each lane sequentially
|
||||
forloop(i, 0, eval($1-1),
|
||||
`patsubst(`$3', `ID\|LANE', i)')
|
||||
br label %pl_done
|
||||
|
||||
pl_not_all_on:
|
||||
;; not all on--see if it is all off or mixed
|
||||
;; for the mixed case, we just run the general case, though we could
|
||||
;; try to be smart and just emit the code based on what it actually is,
|
||||
;; for example by emitting the code straight-line without a loop and doing
|
||||
;; the lane tests explicitly, leaving later optimization passes to eliminate
|
||||
;; the stuff that is definitely not needed. Not clear if we will frequently
|
||||
;; encounter a mask that is known at compile-time but is not either all on or
|
||||
;; all off...
|
||||
%pl_alloff = icmp eq i32 %pl_mask, 0
|
||||
br i1 %pl_alloff, label %pl_done, label %pl_unknown_mask
|
||||
|
||||
pl_unknown_mask:
|
||||
br label %pl_loop
|
||||
|
||||
pl_loop:
|
||||
;; Loop over each lane and see if we want to do the work for this lane
|
||||
%pl_lane = phi i32 [ 0, %pl_unknown_mask ], [ %pl_nextlane, %pl_loopend ]
|
||||
%pl_lanemask = phi i32 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ]
|
||||
|
||||
; is the current lane on? if so, goto do work, otherwise to end of loop
|
||||
%pl_and = and i32 %pl_mask, %pl_lanemask
|
||||
%pl_doit = icmp eq i32 %pl_and, %pl_lanemask
|
||||
br i1 %pl_doit, label %pl_dolane, label %pl_loopend
|
||||
|
||||
pl_dolane:
|
||||
;; If so, substitute in the code from the caller and replace the LANE
|
||||
;; stuff with the current lane number
|
||||
patsubst(`patsubst(`$3', `LANE_ID', `_id')', `LANE', `%pl_lane')
|
||||
br label %pl_loopend
|
||||
|
||||
pl_loopend:
|
||||
%pl_nextlane = add i32 %pl_lane, 1
|
||||
%pl_nextlanemask = mul i32 %pl_lanemask, 2
|
||||
|
||||
; are we done yet?
|
||||
%pl_test = icmp ne i32 %pl_nextlane, $1
|
||||
br i1 %pl_test, label %pl_loop, label %pl_done
|
||||
|
||||
pl_done:
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
;;
|
||||
;; $1: vector width of the target
|
||||
;; $2: scalar type for which to generate functions to do gathers
|
||||
|
||||
; vec width, type
|
||||
define(`gen_gather', `
|
||||
;; Define the utility function to do the gather operation for a single element
|
||||
;; of the type
|
||||
define internal <$1 x $2> @__gather_elt_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %ret,
|
||||
i32 %lane) nounwind readonly alwaysinline {
|
||||
; compute address for this one from the base
|
||||
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
|
||||
%offset64 = zext i32 %offset32 to i64
|
||||
%ptrdelta = add i64 %ptr64, %offset64
|
||||
%ptr = inttoptr i64 %ptrdelta to $2 *
|
||||
|
||||
; load value and insert into returned value
|
||||
%val = load $2 *%ptr
|
||||
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
|
||||
ret <$1 x $2> %updatedret
|
||||
}
|
||||
|
||||
|
||||
define <$1 x $2> @__gather_base_offsets_$2(i8*, <$1 x i32> %offsets,
|
||||
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
entry:
|
||||
%mask = call i32 @__movmsk(<$1 x i32> %vecmask)
|
||||
%ptr64 = ptrtoint i8 * %0 to i64
|
||||
|
||||
%maskKnown = call i1 @__is_compile_time_constant_mask(<$1 x i32> %vecmask)
|
||||
br i1 %maskKnown, label %known_mask, label %unknown_mask
|
||||
|
||||
known_mask:
|
||||
%alloff = icmp eq i32 %mask, 0
|
||||
br i1 %alloff, label %gather_all_off, label %unknown_mask
|
||||
|
||||
gather_all_off:
|
||||
ret <$1 x $2> undef
|
||||
|
||||
unknown_mask:
|
||||
; We can be clever and avoid the per-lane stuff for gathers if we are willing
|
||||
; to require that the 0th element of the array being gathered from is always
|
||||
; legal to read from (and we do indeed require that, given the benefits!)
|
||||
;
|
||||
; Set the offset to zero for lanes that are off
|
||||
%offsetsPtr = alloca <$1 x i32>
|
||||
store <$1 x i32> zeroinitializer, <$1 x i32> * %offsetsPtr
|
||||
call void @__masked_store_blend_32(<$1 x i32> * %offsetsPtr, <$1 x i32> %offsets,
|
||||
<$1 x i32> %vecmask)
|
||||
%newOffsets = load <$1 x i32> * %offsetsPtr
|
||||
|
||||
%ret0 = call <$1 x $2> @__gather_elt_$2(i64 %ptr64, <$1 x i32> %newOffsets,
|
||||
<$1 x $2> undef, i32 0)
|
||||
forloop(lane, 1, eval($1-1),
|
||||
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt_$2(i64 %ptr64,
|
||||
<$1 x i32> %newOffsets, <$1 x $2> %retPREV, i32 LANE)
|
||||
', `LANE', lane), `PREV', eval(lane-1))')
|
||||
ret <$1 x $2> %ret`'eval($1-1)
|
||||
}
|
||||
'
|
||||
)
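;; The AVX target above instantiates this as gen_gather(8, i32) and
;; gen_gather(8, i64), producing __gather_elt_i32 / __gather_base_offsets_i32
;; and the corresponding i64 variants.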
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gen_scatter
|
||||
;; Emit a function declaration for a scalarized scatter.
|
||||
;;
|
||||
;; $1: target vector width
|
||||
;; $2: scalar type for which we want to generate code to scatter
|
||||
|
||||
define(`gen_scatter', `
|
||||
;; Define the function that describes the work to do to scatter a single
|
||||
;; value
|
||||
define internal void @__scatter_elt_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values,
|
||||
i32 %lane) nounwind alwaysinline {
|
||||
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
|
||||
%offset64 = zext i32 %offset32 to i64
|
||||
%ptrdelta = add i64 %ptr64, %offset64
|
||||
%ptr = inttoptr i64 %ptrdelta to $2 *
|
||||
%storeval = extractelement <$1 x $2> %values, i32 %lane
|
||||
store $2 %storeval, $2 * %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__scatter_base_offsets_$2(i8* %base, <$1 x i32> %offsets, <$1 x $2> %values,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
|
||||
%ptr64 = ptrtoint i8 * %base to i64
|
||||
per_lane($1, <$1 x i32> %mask, `
|
||||
call void @__scatter_elt_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values, i32 LANE)')
|
||||
ret void
|
||||
}
|
||||
'
|
||||
)
|
||||
@@ -2,10 +2,11 @@
|
||||
|
||||
import sys
|
||||
|
||||
print "const char *stdlib_code = "
|
||||
for line in sys.stdin:
|
||||
l=line.rstrip()
|
||||
l=l.replace('"', '\\"')
|
||||
print "\"" + l + "\\n\""
|
||||
print "char stdlib_code[] = { "
|
||||
|
||||
print ";"
|
||||
for line in sys.stdin:
|
||||
for c in line:
|
||||
print ord(c)
|
||||
print ", "
|
||||
|
||||
print "0 };"
|
||||
|
||||
stmt.cpp: 155 lines changed
@@ -133,7 +133,7 @@ lInitSymbol(llvm::Value *lvalue, const char *symName, const Type *type,
|
||||
// Initialize things without initializers to the undefined value.
|
||||
// To auto-initialize everything to zero, replace 'UndefValue' with
|
||||
// 'NullValue' in the below
|
||||
const llvm::Type *ltype = type->LLVMType(g->ctx);
|
||||
LLVM_TYPE_CONST llvm::Type *ltype = type->LLVMType(g->ctx);
|
||||
ctx->StoreInst(llvm::UndefValue::get(ltype), lvalue);
|
||||
return;
|
||||
}
|
||||
@@ -152,13 +152,14 @@ lInitSymbol(llvm::Value *lvalue, const char *symName, const Type *type,
|
||||
}
|
||||
}
|
||||
|
||||
// Atomic types can't be initialized with { ... } initializer
|
||||
// Atomic types and enums can't be initialized with { ... } initializer
|
||||
// expressions, so print an error and return if that's what we've got
|
||||
// here..
|
||||
if (dynamic_cast<const AtomicType *>(type) != NULL) {
|
||||
if (dynamic_cast<const AtomicType *>(type) != NULL ||
|
||||
dynamic_cast<const EnumType *>(type) != NULL) {
|
||||
if (dynamic_cast<ExprList *>(initExpr) != NULL)
|
||||
Error(initExpr->pos, "Expression list initializers can't be used for "
|
||||
"variable \"%s\' with atomic type \"%s\".", symName,
|
||||
"variable \"%s\' with type \"%s\".", symName,
|
||||
type->GetString().c_str());
|
||||
return;
|
||||
}
|
||||
@@ -178,89 +179,49 @@ lInitSymbol(llvm::Value *lvalue, const char *symName, const Type *type,
|
||||
return;
|
||||
}
|
||||
|
||||
// There are two cases for initializing arrays and vectors; either a single
|
||||
// initializer may be provided (float foo[3] = 0;), in which case all
|
||||
// of the array elements are initialized to the given value, or an
|
||||
// initializer list may be provided (float foo[3] = { 1,2,3 }), in
|
||||
// which case the array elements are initialized with the corresponding
|
||||
// There are two cases for initializing structs, arrays and vectors;
|
||||
// either a single initializer may be provided (float foo[3] = 0;), in
|
||||
// which case all of the elements are initialized to the given value,
|
||||
// or an initializer list may be provided (float foo[3] = { 1,2,3 }),
|
||||
// in which case the elements are initialized with the corresponding
|
||||
// values.
|
||||
const SequentialType *seqType = dynamic_cast<const SequentialType *>(type);
|
||||
if (seqType != NULL) {
|
||||
ExprList *exprList = dynamic_cast<ExprList *>(initExpr);
|
||||
if (exprList == NULL) {
|
||||
// We have single expression; loop over the elements of the
|
||||
// array/vector and initialize each of them with it
|
||||
// individually.
|
||||
for (int i = 0; i < seqType->GetElementCount(); ++i) {
|
||||
llvm::Value *ptr = ctx->GetElementPtrInst(lvalue, 0, i, "offset");
|
||||
lInitSymbol(ptr, symName, seqType->GetElementType(), initExpr,
|
||||
ctx, pos);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise make sure that we have the same number of elements
|
||||
// in the { } initializer expression as we have in the
|
||||
// array/vector
|
||||
int nInits = exprList->exprs.size();
|
||||
if (nInits != seqType->GetElementCount()) {
|
||||
const char *actualType = dynamic_cast<const ArrayType *>(type) ?
|
||||
"Array" : "Vector";
|
||||
Error(initExpr->pos, "%s initializer for variable \"%s\" requires "
|
||||
"%d values; %d provided.", actualType, symName,
|
||||
seqType->GetElementCount(), nInits);
|
||||
}
|
||||
else {
|
||||
// And initialize each of the array/vector elements with
|
||||
// the corresponding expression from the ExprList
|
||||
for (int i = 0; i < nInits; ++i) {
|
||||
llvm::Value *ptr = ctx->GetElementPtrInst(lvalue, 0, i, "offset");
|
||||
lInitSymbol(ptr, symName, seqType->GetElementType(),
|
||||
exprList->exprs[i], ctx, pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
const CollectionType *collectionType =
|
||||
dynamic_cast<const CollectionType *>(type);
|
||||
if (collectionType != NULL) {
|
||||
std::string name;
|
||||
if (dynamic_cast<const StructType *>(type) != NULL)
|
||||
name = "struct";
|
||||
else if (dynamic_cast<const ArrayType *>(type) != NULL)
|
||||
name = "array";
|
||||
else if (dynamic_cast<const VectorType *>(type) != NULL)
|
||||
name = "vector";
|
||||
else
|
||||
FATAL("Unexpected CollectionType in lInitSymbol()");
|
||||
|
||||
// Structs can similarly be initialized in one of two ways; either with
|
||||
// a list of expressions in braces, one expression per struct member,
|
||||
// or with a single expression that is used to initialize all struct
|
||||
// members.
|
||||
const StructType *st = dynamic_cast<const StructType *>(type);
|
||||
if (st) {
|
||||
ExprList *exprList = dynamic_cast<ExprList *>(initExpr);
|
||||
if (exprList != NULL) {
|
||||
// The { ... } case; make sure we have the same number of
|
||||
// expressions in the ExprList as we have struct members
|
||||
int nInits = exprList->exprs.size();
|
||||
if (nInits != st->NumElements())
|
||||
Error(initExpr->pos,
|
||||
"Initializer for struct \"%s\" requires %d values; %d provided.",
|
||||
symName, st->NumElements(), nInits);
|
||||
else {
|
||||
// Initialize each struct member with the corresponding
|
||||
// value from the ExprList
|
||||
for (int i = 0; i < nInits; ++i) {
|
||||
llvm::Value *ep = ctx->GetElementPtrInst(lvalue, 0, i, "structelement");
|
||||
lInitSymbol(ep, symName, st->GetMemberType(i), exprList->exprs[i],
|
||||
ctx, pos);
|
||||
}
|
||||
if (nInits != collectionType->GetElementCount()) {
|
||||
Error(initExpr->pos, "Initializer for %s \"%s\" requires "
|
||||
"%d values; %d provided.", name.c_str(), symName,
|
||||
collectionType->GetElementCount(), nInits);
|
||||
return;
|
||||
}
|
||||
|
||||
// Initialize each element with the corresponding value from
|
||||
// the ExprList
|
||||
for (int i = 0; i < nInits; ++i) {
|
||||
llvm::Value *ep = ctx->GetElementPtrInst(lvalue, 0, i, "element");
|
||||
lInitSymbol(ep, symName, collectionType->GetElementType(i),
|
||||
exprList->exprs[i], ctx, pos);
|
||||
}
|
||||
}
|
||||
else if (initExpr->GetType()->IsNumericType() ||
|
||||
initExpr->GetType()->IsBoolType()) {
|
||||
// Otherwise initialize all of the struct elements in turn with
|
||||
// the initExpr.
|
||||
for (int i = 0; i < st->NumElements(); ++i) {
|
||||
llvm::Value *ep = ctx->GetElementPtrInst(lvalue, 0, i, "structelement");
|
||||
lInitSymbol(ep, symName, st->GetMemberType(i), initExpr, ctx, pos);
|
||||
}
|
||||
}
|
||||
else {
|
||||
else
|
||||
Error(initExpr->pos, "Can't assign type \"%s\" to \"%s\".",
|
||||
initExpr->GetType()->GetString().c_str(),
|
||||
st->GetString().c_str());
|
||||
}
|
||||
collectionType->GetString().c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -279,6 +240,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
continue;
|
||||
|
||||
Symbol *sym = decl->sym;
|
||||
assert(decl->sym != NULL);
|
||||
const Type *type = sym->type;
|
||||
if (!type)
|
||||
continue;
|
||||
@@ -319,7 +281,8 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
continue;
|
||||
}
|
||||
|
||||
const llvm::Type *llvmType = type->LLVMType(g->ctx);
|
||||
LLVM_TYPE_CONST llvm::Type *llvmType = type->LLVMType(g->ctx);
|
||||
assert(llvmType != NULL);
|
||||
|
||||
if (declaration->declSpecs->storageClass == SC_STATIC) {
|
||||
// For static variables, we need a compile-time constant value
|
||||
@@ -343,17 +306,19 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
llvm::Twine("static.") +
|
||||
llvm::Twine(sym->pos.first_line) +
|
||||
llvm::Twine(".") + sym->name.c_str());
|
||||
// Tell the FunctionEmitContext about the variable
|
||||
ctx->EmitVariableDebugInfo(sym);
|
||||
}
|
||||
else {
|
||||
// For non-static variables, allocate storage on the stack
|
||||
sym->storagePtr = ctx->AllocaInst(llvmType, sym->name.c_str());
|
||||
// Tell the FunctionEmitContext about the variable; must do
|
||||
// this before the initializer stuff.
|
||||
ctx->EmitVariableDebugInfo(sym);
|
||||
// And then get it initialized...
|
||||
lInitSymbol(sym->storagePtr, sym->name.c_str(), type, decl->initExpr,
|
||||
ctx, sym->pos);
|
||||
}
|
||||
|
||||
// Finally, tell the FunctionEmitContext about the variable
|
||||
ctx->EmitVariableDebugInfo(sym);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -393,11 +358,16 @@ DeclStmt::Optimize() {
|
||||
|
||||
Stmt *
|
||||
DeclStmt::TypeCheck() {
|
||||
bool encounteredError = false;
|
||||
for (unsigned int i = 0; i < declaration->declarators.size(); ++i) {
|
||||
Declarator *decl = declaration->declarators[i];
|
||||
if (!decl || !decl->initExpr)
|
||||
if (!decl) {
|
||||
encounteredError = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!decl->initExpr)
|
||||
continue;
|
||||
decl->initExpr = decl->initExpr->TypeCheck();
|
||||
if (!decl->initExpr)
|
||||
continue;
|
||||
@@ -406,7 +376,8 @@ DeclStmt::TypeCheck() {
|
||||
// the int->float type conversion is in there and we don't return
|
||||
// an int as the constValue later...
|
||||
const Type *type = decl->sym->type;
|
||||
if (dynamic_cast<const AtomicType *>(type) != NULL) {
|
||||
if (dynamic_cast<const AtomicType *>(type) != NULL ||
|
||||
dynamic_cast<const EnumType *>(type) != NULL) {
|
||||
// If it's an expr list with an atomic type, we'll later issue
|
||||
// an error. Need to leave decl->initExpr as is in that case so it
|
||||
// is in fact caught later, though.
|
||||
@@ -414,7 +385,7 @@ DeclStmt::TypeCheck() {
|
||||
decl->initExpr = decl->initExpr->TypeConv(type, "initializer");
|
||||
}
|
||||
}
|
||||
return this;
|
||||
return encounteredError ? NULL : this;
|
||||
}
|
||||
|
||||
|
||||
@@ -1436,6 +1407,18 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Promote int8 and int16 types to int32s...
|
||||
const Type *baseType = type->GetAsNonConstType()->GetAsUniformType();
|
||||
if (baseType == AtomicType::UniformInt8 ||
|
||||
baseType == AtomicType::UniformUInt8 ||
|
||||
baseType == AtomicType::UniformInt16 ||
|
||||
baseType == AtomicType::UniformUInt16) {
|
||||
expr = new TypeCastExpr(type->IsUniformType() ? AtomicType::UniformInt32 :
|
||||
AtomicType::VaryingInt32,
|
||||
expr, expr->pos);
|
||||
type = expr->GetType();
|
||||
}
|
||||
|
||||
char t = lEncodeType(type->GetAsNonConstType());
|
||||
if (t == '\0') {
|
||||
Error(expr->pos, "Only atomic types are allowed in print statements; "
|
||||
@@ -1445,7 +1428,7 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
|
||||
else {
|
||||
argTypes.push_back(t);
|
||||
|
||||
const llvm::Type *llvmExprType = type->LLVMType(g->ctx);
|
||||
LLVM_TYPE_CONST llvm::Type *llvmExprType = type->LLVMType(g->ctx);
|
||||
llvm::Value *ptr = ctx->AllocaInst(llvmExprType, "print_arg");
|
||||
llvm::Value *val = expr->GetValue(ctx);
|
||||
if (!val)
|
||||
@@ -1459,7 +1442,7 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
|
||||
|
||||
|
||||
/* PrintStmt works closely with the __do_print() function implemented in
|
||||
the stdlib-c.c file. In particular, the EmitCode() method here needs to
|
||||
the builtins-c.c file. In particular, the EmitCode() method here needs to
|
||||
take the arguments passed to it from ispc and generate a valid call to
|
||||
__do_print() with the information that __do_print() then needs to do the
|
||||
actual printing work at runtime.
|
||||
@@ -1491,7 +1474,7 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
int nArgs = elist ? elist->exprs.size() : 1;
|
||||
|
||||
// Allocate space for the array of pointers to values to be printed
|
||||
const llvm::Type *argPtrArrayType =
|
||||
LLVM_TYPE_CONST llvm::Type *argPtrArrayType =
|
||||
llvm::ArrayType::get(LLVMTypes::VoidPointerType, nArgs);
|
||||
llvm::Value *argPtrArray = ctx->AllocaInst(argPtrArrayType,
|
||||
"print_arg_ptrs");
|
||||
|
||||
sym.cpp: 23 lines changed
@@ -59,6 +59,7 @@ Symbol::MangledName() const {
|
||||
return name + type->Mangle();
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// SymbolTable
|
||||
|
||||
@@ -257,7 +258,19 @@ SymbolTable::ClosestVariableOrFunctionMatch(const char *str) const {
|
||||
|
||||
std::vector<std::string>
|
||||
SymbolTable::ClosestTypeMatch(const char *str) const {
|
||||
// This follows the same approach as ClosestVariableOrFunctionmatch()
|
||||
    return closestTypeMatch(str, true);
}


std::vector<std::string>
SymbolTable::ClosestEnumTypeMatch(const char *str) const {
    return closestTypeMatch(str, false);
}


std::vector<std::string>
SymbolTable::closestTypeMatch(const char *str, bool structsVsEnums) const {
    // This follows the same approach as ClosestVariableOrFunctionMatch()
    // above; compute all edit distances, keep the ones shorter than
    // maxDelta, return the first non-empty vector of one or more sets of
    // alternatives with minimal edit distance.
@@ -267,6 +280,14 @@ SymbolTable::ClosestTypeMatch(const char *str) const {
    for (unsigned int i = 0; i < types.size(); ++i) {
        TypeMapType::const_iterator iter;
        for (iter = types[i]->begin(); iter != types[i]->end(); ++iter) {
            // Skip over either StructTypes or EnumTypes, depending on the
            // value of the structsVsEnums parameter
            bool isEnum = (dynamic_cast<const EnumType *>(iter->second) != NULL);
            if (isEnum && structsVsEnums)
                continue;
            else if (!isEnum && !structsVsEnums)
                continue;

            int dist = StringEditDistance(str, iter->first, maxDelta+1);
            if (dist <= maxDelta)
                matches[dist].push_back(iter->first);
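The comments in closestTypeMatch() above spell out the matching strategy: compute the edit distance from the query to every candidate name, keep the ones within maxDelta, and return the closest non-empty group of alternatives. A self-contained sketch of that idea, using a plain Levenshtein distance as a stand-in for StringEditDistance() (the helper names and the maxDelta default are illustrative, not taken from sym.cpp):

#include <algorithm>
#include <string>
#include <vector>

// Two-row Levenshtein distance; stands in for StringEditDistance() here.
static int editDistance(const std::string &a, const std::string &b) {
    std::vector<int> prev(b.size() + 1), cur(b.size() + 1);
    for (size_t j = 0; j <= b.size(); ++j) prev[j] = (int)j;
    for (size_t i = 1; i <= a.size(); ++i) {
        cur[0] = (int)i;
        for (size_t j = 1; j <= b.size(); ++j)
            cur[j] = std::min({ prev[j] + 1, cur[j - 1] + 1,
                                prev[j - 1] + (a[i - 1] != b[j - 1]) });
        std::swap(prev, cur);
    }
    return prev[b.size()];
}

// Bucket candidates by edit distance and return the closest non-empty bucket,
// mirroring the strategy described in the closestTypeMatch() comments.
std::vector<std::string>
closestMatches(const std::string &query,
               const std::vector<std::string> &candidates, int maxDelta = 2) {
    std::vector<std::vector<std::string>> byDist(maxDelta + 1);
    for (const std::string &c : candidates) {
        int d = editDistance(query, c);
        if (d <= maxDelta)
            byDist[d].push_back(c);
    }
    for (const auto &bucket : byDist)
        if (!bucket.empty())
            return bucket;
    return {};
}

ClosestTypeMatch() and ClosestEnumTypeMatch() then differ only in which candidate set they scan, which is what the structsVsEnums flag selects in the loop above.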
5 sym.h
@@ -219,11 +219,16 @@ public:
        name. */
    std::vector<std::string> ClosestTypeMatch(const char *name) const;

    std::vector<std::string> ClosestEnumTypeMatch(const char *name) const;

    /** Prints out the entire contents of the symbol table to standard error.
        (Debugging method). */
    void Print();

private:
    std::vector<std::string> closestTypeMatch(const char *str,
                                              bool structsVsEnums) const;

    /** This member variable holds one \c vector of Symbol pointers for
        each of the current active scopes as the program is being parsed.
        New vectors of symbols are added and removed from the end of the
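The member documentation above (truncated by this hunk) describes the table's storage as one collection of symbols per active scope, pushed and popped as parsing enters and leaves scopes. As a rough illustration of that layout only, here is a minimal sketch with hypothetical names; the real SymbolTable in sym.h keeps more state and different containers.

#include <map>
#include <string>
#include <vector>

struct Sym { std::string name; /* type, storage class, etc. elided */ };

// Hypothetical scope-stack layout: one map of symbols per active scope.
class ScopedSymbolTableSketch {
    std::vector<std::map<std::string, Sym *>> scopes;
public:
    void PushScope() { scopes.emplace_back(); }
    void PopScope()  { scopes.pop_back(); }

    // Returns false if the name is already defined in the current scope.
    bool AddVariable(Sym *s) {
        return scopes.back().insert({ s->name, s }).second;
    }

    // Look up from the innermost scope outward.
    Sym *Lookup(const std::string &name) {
        for (auto it = scopes.rbegin(); it != scopes.rend(); ++it) {
            auto found = it->find(name);
            if (found != it->end())
                return found->second;
        }
        return nullptr;
    }
};

Because Lookup() walks from the innermost scope outward, an inner declaration shadows an outer one, which is the usual behavior for a scoped symbol table.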
@@ -11,8 +11,11 @@ void f(reference uniform Foo foo[], float a) {

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
    float f[40] = a;
    float g[40] = b;
    float f[40], g[40];
    for (uniform int i = 0; i < 40; ++i) {
        f[i] = a;
        g[i] = b;
    }
    if (a < 2)
        f = g;
    RET[programIndex] = f[a];

@@ -5,7 +5,11 @@ export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
    uniform float x[47][47] = 2.;
    uniform float x[47][47];
    for (uniform int i = 0; i < 47; ++i)
        for (uniform int j = 0; j < 47; ++j)
            x[i][j] = 2+b-5;

    // all are 2 except (3,4) = 0, (1,4) = 1, (2,4) = 1, (4,4) = 1
    if (a == 3.)
        x[a][b-1] = 0;

@@ -4,7 +4,11 @@ export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
    uniform float x[47][47] = 2.;
    uniform float x[47][47];
    for (uniform int i = 0; i < 47; ++i)
        for (uniform int j = 0; j < 47; ++j)
            x[i][j] = 2+b-5;

    // all are 2 except (4,2) = 0, (4,...) = 1, (4,programCount-1)=2
    if (a == 3.)
        x[b-1][a-1] = 0;

@@ -5,7 +5,11 @@ export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
    uniform float x[47][74] = 2.;
    uniform float x[47][47];
    for (uniform int i = 0; i < 47; ++i)
        for (uniform int j = 0; j < 47; ++j)
            x[i][j] = 2+b-5;

    x[a][b-1] = 0;
    RET[programIndex] = x[2][a];
}

@@ -4,7 +4,11 @@ export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
    float x[40][40] = b;
    float x[40][40];
    for (uniform int i = 0; i < 40; ++i)
        for (uniform int j = 0; j < 40; ++j)
            x[i][j] = b;

    uniform int index[4] = { 0, 1, 2, 4 };
    float v = index[programIndex & 0x3];
    x[a][v] = 0;

@@ -5,7 +5,9 @@ export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
    uniform float x[40] = 0;
    uniform float x[40];
    for (uniform int i = 0; i < 40; ++i)
        x[i] = 0.;
    x[a] = 2;
    RET[programIndex] = x[4] + x[0] + x[5];
}
14 tests/atomics-1.ispc Normal file
@@ -0,0 +1,14 @@

export uniform int width() { return programCount; }

uniform unsigned int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = atomic_add_global(s, 1);
    RET[programIndex] = reduce_add(b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = reduce_add(programIndex);
}

14 tests/atomics-2.ispc Normal file
@@ -0,0 +1,14 @@

export uniform int width() { return programCount; }

uniform int64 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = atomic_add_global(s, 1);
    RET[programIndex] = reduce_add(b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = reduce_add(programIndex);
}

14 tests/atomics-3.ispc Normal file
@@ -0,0 +1,14 @@

export uniform int width() { return programCount; }

uniform int32 s = 0xff;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = atomic_xor_global(s, 0xfffffff0);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0xff;
}

14 tests/atomics-4.ispc Normal file
@@ -0,0 +1,14 @@

export uniform int width() { return programCount; }

uniform int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = atomic_or_global(s, (1<<programIndex));
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = (1<<programCount)-1;
}

14 tests/atomics-5.ispc Normal file
@@ -0,0 +1,14 @@

export uniform int width() { return programCount; }

uniform int32 s = 0xbeef;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = atomic_swap_global(s, programIndex);
    RET[programIndex] = reduce_max(b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0xbeef;
}

14 tests/atomics-6.ispc Normal file
@@ -0,0 +1,14 @@

export uniform int width() { return programCount; }

uniform int32 s = 2;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = atomic_compare_exchange_global(s, programIndex, a*1000);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 3000;
}
12 tests/broadcast-1.ispc Normal file
@@ -0,0 +1,12 @@

export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int a = aFOO[programIndex];
    int br = broadcast(a, (uniform int)b-2);
    RET[programIndex] = br;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 4;
}

12 tests/broadcast-2.ispc Normal file
@@ -0,0 +1,12 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    int16 a = aFOO[programIndex];
    int16 b = broadcast(a, 2);
    RET[programIndex] = b;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 3;
}

12 tests/broadcast-3.ispc Normal file
@@ -0,0 +1,12 @@

export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int8 a = aFOO[programIndex];
    int8 br = broadcast(a, (uniform int)b-2);
    RET[programIndex] = br;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 4;
}

12 tests/broadcast.ispc Normal file
@@ -0,0 +1,12 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = broadcast(a, 2);
    RET[programIndex] = b;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 3;
}
11 tests/double-abs-1.ispc Normal file
@@ -0,0 +1,11 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    double a = aFOO[programIndex];
    RET[programIndex] = abs(-a);
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}

11 tests/double-abs.ispc Normal file
@@ -0,0 +1,11 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    double a = aFOO[programIndex];
    RET[programIndex] = abs(a);
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}

12 tests/double-max-1.ispc Normal file
@@ -0,0 +1,12 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    double a = aFOO[programIndex];
    double b = -2. * a;
    RET[programIndex] = max(a,b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 * (1 + programIndex);
}

12 tests/double-max.ispc Normal file
@@ -0,0 +1,12 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    double a = aFOO[programIndex];
    double b = 2. * a;
    RET[programIndex] = max(a,b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = 2 * (1 + programIndex);
}

12 tests/double-min-1.ispc Normal file
@@ -0,0 +1,12 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    double a = aFOO[programIndex];
    double b = -2. * a;
    RET[programIndex] = min(a,b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = -2 * (1 + programIndex);
}

12 tests/double-min.ispc Normal file
@@ -0,0 +1,12 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    double a = aFOO[programIndex];
    double b = 2. * a;
    RET[programIndex] = min(a,b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}

16 tests/double-sqrt.ispc Normal file
@@ -0,0 +1,16 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    double a = aFOO[programIndex];
    if (programIndex & 1) {
        a *= a;
        RET[programIndex] = sqrt(a);
    }
    else
        RET[programIndex] = a;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
Some files were not shown because too many files have changed in this diff.