Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2f35bc1a0f | ||
|
|
1620e0508d |
20
Makefile
20
Makefile
@@ -25,8 +25,7 @@ BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
|
|||||||
|
|
||||||
CXX=g++
|
CXX=g++
|
||||||
CPP=cpp
|
CPP=cpp
|
||||||
OPT=-g3
|
CXXFLAGS=-g3 $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
|
||||||
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
|
|
||||||
-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
|
-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
|
||||||
|
|
||||||
LDFLAGS=
|
LDFLAGS=
|
||||||
@@ -45,13 +44,13 @@ YACC=bison -d -v -t
|
|||||||
|
|
||||||
###########################################################################
|
###########################################################################
|
||||||
|
|
||||||
CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
|
CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
|
||||||
llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
|
llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
|
||||||
util.cpp
|
util.cpp
|
||||||
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
|
HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
|
||||||
opt.h stmt.h sym.h type.h util.h
|
opt.h stmt.h sym.h type.h util.h
|
||||||
BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll builtins-sse2-x2.ll \
|
BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
|
||||||
builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
|
builtins-sse4.ll builtins-sse4x2.ll
|
||||||
BISON_SRC=parse.yy
|
BISON_SRC=parse.yy
|
||||||
FLEX_SRC=lex.ll
|
FLEX_SRC=lex.ll
|
||||||
|
|
||||||
@@ -112,7 +111,7 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
|
|||||||
@echo Compiling $<
|
@echo Compiling $<
|
||||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||||
|
|
||||||
objs/builtins-%.cpp: builtins-%.ll
|
objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll
|
||||||
@echo Creating C++ source from builtin definitions file $<
|
@echo Creating C++ source from builtin definitions file $<
|
||||||
@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
|
@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
|
||||||
|
|
||||||
@@ -143,10 +142,3 @@ objs/stdlib_ispc.cpp: stdlib.ispc
|
|||||||
objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
|
objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
|
||||||
@echo Compiling $<
|
@echo Compiling $<
|
||||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||||
|
|
||||||
objs/builtins-sse2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2.ll
|
|
||||||
objs/builtins-sse2-x2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2-x2.ll
|
|
||||||
objs/builtins-sse4.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4.ll
|
|
||||||
objs/builtins-sse4-x2.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4-x2.ll
|
|
||||||
objs/builtins-avx.cpp: builtins.m4 builtins-avx-common.ll builtins-avx.ll
|
|
||||||
objs/builtins-avx-x2.cpp: builtins.m4 builtins-avx-common.ll builtins-avx-x2.ll
|
|
||||||
|
|||||||
94
ast.h
94
ast.h
@@ -1,94 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright (c) 2011, Intel Corporation
|
|
||||||
All rights reserved.
|
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are
|
|
||||||
met:
|
|
||||||
|
|
||||||
* Redistributions of source code must retain the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer.
|
|
||||||
|
|
||||||
* Redistributions in binary form must reproduce the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer in the
|
|
||||||
documentation and/or other materials provided with the distribution.
|
|
||||||
|
|
||||||
* Neither the name of Intel Corporation nor the names of its
|
|
||||||
contributors may be used to endorse or promote products derived from
|
|
||||||
this software without specific prior written permission.
|
|
||||||
|
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
|
||||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
||||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
||||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
||||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
||||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
||||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
||||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
||||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/** @file ast.h
|
|
||||||
@brief
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef ISPC_AST_H
|
|
||||||
#define ISPC_AST_H 1
|
|
||||||
|
|
||||||
#include "ispc.h"
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
|
|
||||||
|
|
||||||
This class defines a basic interface that all abstract syntax tree
|
|
||||||
(AST) nodes must implement. The base classes for both expressions
|
|
||||||
(Expr) and statements (Stmt) inherit from this class.
|
|
||||||
*/
|
|
||||||
class ASTNode {
|
|
||||||
public:
|
|
||||||
ASTNode(SourcePos p) : pos(p) { }
|
|
||||||
virtual ~ASTNode();
|
|
||||||
|
|
||||||
/** The Optimize() method should perform any appropriate early-stage
|
|
||||||
optimizations on the node (e.g. constant folding). The caller
|
|
||||||
should use the returned ASTNode * in place of the original node.
|
|
||||||
This method may return NULL if an error is encountered during
|
|
||||||
optimization. */
|
|
||||||
virtual ASTNode *Optimize() = 0;
|
|
||||||
|
|
||||||
/** Type checking should be performed by the node when this method is
|
|
||||||
called. In the event of an error, a NULL value may be returned.
|
|
||||||
As with ASTNode::Optimize(), the caller should store the returned
|
|
||||||
pointer in place of the original ASTNode *. */
|
|
||||||
virtual ASTNode *TypeCheck() = 0;
|
|
||||||
|
|
||||||
virtual int EstimateCost() const = 0;
|
|
||||||
|
|
||||||
/** All AST nodes must track the file position where they are
|
|
||||||
defined. */
|
|
||||||
SourcePos pos;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/** Simple representation of the abstract syntax trees for all of the
|
|
||||||
functions declared in a compilation unit.
|
|
||||||
*/
|
|
||||||
class AST {
|
|
||||||
public:
|
|
||||||
/** Add the AST for a function described by the given declaration
|
|
||||||
information and source code. */
|
|
||||||
void AddFunction(Symbol *sym, const std::vector<Symbol *> &args,
|
|
||||||
Stmt *code);
|
|
||||||
|
|
||||||
/** Generate LLVM IR for all of the functions into the current
|
|
||||||
module. */
|
|
||||||
void GenerateIR();
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::vector<Function *> functions;
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif // ISPC_AST_H
|
|
||||||
16
buildall.bat
16
buildall.bat
@@ -1,16 +0,0 @@
|
|||||||
@echo off
|
|
||||||
|
|
||||||
REM If LLVM_INSTALL_DIR isn't set globally in your environment,
|
|
||||||
REM it can be set here_
|
|
||||||
set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
|
|
||||||
|
|
||||||
REM Both the LLVM binaries and python need to be in the path
|
|
||||||
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
|
|
||||||
|
|
||||||
msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
|
|
||||||
msbuild ispc_test.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
|
|
||||||
|
|
||||||
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Release /t:rebuild
|
|
||||||
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Debug /t:rebuild
|
|
||||||
msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild
|
|
||||||
msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Debug /t:rebuild
|
|
||||||
@@ -30,14 +30,18 @@
|
|||||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; AVX target implementation.
|
;; *** Untested *** AVX target implementation.
|
||||||
|
;;
|
||||||
|
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||||
|
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||||
|
;; chance that there are bugs in the code in this file.
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rcp
|
;; rcp
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
; uniform float iv = extract(__rcp_u(v), 0);
|
; uniform float iv = extract(__rcp_u(v), 0);
|
||||||
; return iv * (2. - v * iv);
|
; return iv * (2. - v * iv);
|
||||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||||
@@ -56,7 +60,7 @@ define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
|||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||||
|
|
||||||
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||||
; the roundss intrinsic is a total mess--docs say:
|
; the roundss intrinsic is a total mess--docs say:
|
||||||
;
|
;
|
||||||
@@ -79,7 +83,7 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
|||||||
ret float %rs
|
ret float %rs
|
||||||
}
|
}
|
||||||
|
|
||||||
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
; see above for round_ss instrinsic discussion...
|
; see above for round_ss instrinsic discussion...
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
@@ -88,7 +92,7 @@ define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
|||||||
ret float %rs
|
ret float %rs
|
||||||
}
|
}
|
||||||
|
|
||||||
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
; see above for round_ss instrinsic discussion...
|
; see above for round_ss instrinsic discussion...
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
@@ -102,14 +106,14 @@ define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
|||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||||
|
|
||||||
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||||
%rs = extractelement <2 x double> %xr, i32 0
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
ret double %rs
|
ret double %rs
|
||||||
}
|
}
|
||||||
|
|
||||||
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
; see above for round_ss instrinsic discussion...
|
; see above for round_ss instrinsic discussion...
|
||||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
@@ -118,7 +122,7 @@ define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
|||||||
ret double %rs
|
ret double %rs
|
||||||
}
|
}
|
||||||
|
|
||||||
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
; see above for round_ss instrinsic discussion...
|
; see above for round_ss instrinsic discussion...
|
||||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
@@ -133,7 +137,7 @@ define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
|||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||||
@@ -154,7 +158,7 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
|||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||||
ret float %ret
|
ret float %ret
|
||||||
}
|
}
|
||||||
@@ -166,7 +170,7 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
|||||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||||
|
|
||||||
define void @__fastmath() nounwind alwaysinline {
|
define internal void @__fastmath() nounwind alwaysinline {
|
||||||
%ptr = alloca i32
|
%ptr = alloca i32
|
||||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||||
@@ -185,12 +189,12 @@ define void @__fastmath() nounwind alwaysinline {
|
|||||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
|
||||||
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||||
ret float %ret
|
ret float %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||||
ret float %ret
|
ret float %ret
|
||||||
}
|
}
|
||||||
@@ -202,12 +206,12 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
|||||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
|
||||||
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||||
ret i32 %ret
|
ret i32 %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||||
ret i32 %ret
|
ret i32 %ret
|
||||||
}
|
}
|
||||||
@@ -219,12 +223,12 @@ define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
|||||||
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
|
||||||
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||||
ret i32 %ret
|
ret i32 %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||||
ret i32 %ret
|
ret i32 %ret
|
||||||
}
|
}
|
||||||
@@ -234,14 +238,14 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
|||||||
|
|
||||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||||
|
|
||||||
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||||
ret i32 %call
|
ret i32 %call
|
||||||
}
|
}
|
||||||
|
|
||||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||||
|
|
||||||
define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||||
ret i64 %call
|
ret i64 %call
|
||||||
}
|
}
|
||||||
@@ -251,7 +255,7 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
|||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
|
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
|
||||||
|
|
||||||
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
|
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
|
||||||
ret double %ret
|
ret double %ret
|
||||||
}
|
}
|
||||||
@@ -263,12 +267,12 @@ define double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
|||||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||||
|
|
||||||
define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
||||||
ret double %ret
|
ret double %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
||||||
ret double %ret
|
ret double %ret
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -29,6 +29,13 @@
|
|||||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; *** Untested *** AVX target implementation.
|
||||||
|
;;
|
||||||
|
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||||
|
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||||
|
;; chance that there are bugs in the code in this file.
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; Basic 16-wide definitions
|
;; Basic 16-wide definitions
|
||||||
|
|
||||||
@@ -44,7 +51,7 @@ include(`builtins-avx-common.ll')
|
|||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
||||||
|
|
||||||
define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||||
; float iv = __rcp_v(v);
|
; float iv = __rcp_v(v);
|
||||||
; return iv * (2. - v * iv);
|
; return iv * (2. - v * iv);
|
||||||
|
|
||||||
@@ -64,17 +71,17 @@ define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysi
|
|||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
||||||
|
|
||||||
define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
define internal <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||||
round8to16(%0, 8)
|
round8to16(%0, 8)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
define internal <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
round8to16(%0, 9)
|
round8to16(%0, 9)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
round8to16(%0, 10)
|
round8to16(%0, 10)
|
||||||
}
|
}
|
||||||
@@ -84,15 +91,15 @@ define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly always
|
|||||||
|
|
||||||
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
||||||
|
|
||||||
define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
define internal <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||||
round4to16double(%0, 8)
|
round4to16double(%0, 8)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
define internal <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||||
round4to16double(%0, 9)
|
round4to16double(%0, 9)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||||
round4to16double(%0, 10)
|
round4to16double(%0, 10)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -102,7 +109,7 @@ define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alw
|
|||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
||||||
|
|
||||||
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
|
define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
|
||||||
; float is = __rsqrt_v(v);
|
; float is = __rsqrt_v(v);
|
||||||
unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
|
unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
|
||||||
; return 0.5 * is * (3. - (v * is) * is);
|
; return 0.5 * is * (3. - (v * is) * is);
|
||||||
@@ -125,7 +132,7 @@ define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly al
|
|||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
||||||
|
|
||||||
define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||||
unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
|
unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
|
||||||
ret <16 x float> %call
|
ret <16 x float> %call
|
||||||
}
|
}
|
||||||
@@ -153,13 +160,13 @@ declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
|
|||||||
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||||
|
|
||||||
define <16 x float> @__max_varying_float(<16 x float>,
|
define internal <16 x float> @__max_varying_float(<16 x float>,
|
||||||
<16 x float>) nounwind readonly alwaysinline {
|
<16 x float>) nounwind readonly alwaysinline {
|
||||||
binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
|
binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
|
||||||
ret <16 x float> %call
|
ret <16 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define <16 x float> @__min_varying_float(<16 x float>,
|
define internal <16 x float> @__min_varying_float(<16 x float>,
|
||||||
<16 x float>) nounwind readonly alwaysinline {
|
<16 x float>) nounwind readonly alwaysinline {
|
||||||
binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
|
binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
|
||||||
ret <16 x float> %call
|
ret <16 x float> %call
|
||||||
@@ -169,12 +176,12 @@ define <16 x float> @__min_varying_float(<16 x float>,
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; int min/max
|
;; int min/max
|
||||||
|
|
||||||
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
define internal <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||||
ret <16 x i32> %ret
|
ret <16 x i32> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
define internal <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||||
ret <16 x i32> %ret
|
ret <16 x i32> %ret
|
||||||
}
|
}
|
||||||
@@ -183,12 +190,12 @@ define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; unsigned int min/max
|
;; unsigned int min/max
|
||||||
|
|
||||||
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
define internal <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||||
ret <16 x i32> %ret
|
ret <16 x i32> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
define internal <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||||
ret <16 x i32> %ret
|
ret <16 x i32> %ret
|
||||||
}
|
}
|
||||||
@@ -198,7 +205,7 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
|
|||||||
|
|
||||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||||
|
|
||||||
define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||||
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
||||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
@@ -217,7 +224,7 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
|||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||||
|
|
||||||
define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
|
define internal float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
|
||||||
%va = shufflevector <16 x float> %0, <16 x float> undef,
|
%va = shufflevector <16 x float> %0, <16 x float> undef,
|
||||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
%vb = shufflevector <16 x float> %0, <16 x float> undef,
|
%vb = shufflevector <16 x float> %0, <16 x float> undef,
|
||||||
@@ -232,12 +239,12 @@ define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
|
define internal float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
|
||||||
reduce16(float, @__min_varying_float, @__min_uniform_float)
|
reduce16(float, @__min_varying_float, @__min_uniform_float)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
|
define internal float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
|
||||||
reduce16(float, @__max_varying_float, @__max_uniform_float)
|
reduce16(float, @__max_varying_float, @__max_uniform_float)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -246,28 +253,28 @@ reduce_equal(16)
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; horizontal int32 ops
|
;; horizontal int32 ops
|
||||||
|
|
||||||
define <16 x i32> @__add_varying_int32(<16 x i32>,
|
define internal <16 x i32> @__add_varying_int32(<16 x i32>,
|
||||||
<16 x i32>) nounwind readnone alwaysinline {
|
<16 x i32>) nounwind readnone alwaysinline {
|
||||||
%s = add <16 x i32> %0, %1
|
%s = add <16 x i32> %0, %1
|
||||||
ret <16 x i32> %s
|
ret <16 x i32> %s
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
||||||
%s = add i32 %0, %1
|
%s = add i32 %0, %1
|
||||||
ret i32 %s
|
ret i32 %s
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
|
reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
|
reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
|
reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -275,17 +282,17 @@ define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;;; horizontal uint32 ops
|
;;; horizontal uint32 ops
|
||||||
|
|
||||||
define i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
|
||||||
%r = call i32 @__reduce_add_int32(<16 x i32> %v)
|
%r = call i32 @__reduce_add_int32(<16 x i32> %v)
|
||||||
ret i32 %r
|
ret i32 %r
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -295,7 +302,7 @@ define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
|||||||
|
|
||||||
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||||
|
|
||||||
define double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
|
define internal double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
|
||||||
%va = shufflevector <16 x double> %0, <16 x double> undef,
|
%va = shufflevector <16 x double> %0, <16 x double> undef,
|
||||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||||
%vb = shufflevector <16 x double> %0, <16 x double> undef,
|
%vb = shufflevector <16 x double> %0, <16 x double> undef,
|
||||||
@@ -315,12 +322,12 @@ define double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline
|
|||||||
ret double %sum
|
ret double %sum
|
||||||
}
|
}
|
||||||
|
|
||||||
define double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
|
define internal double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
|
||||||
reduce16(double, @__min_varying_double, @__min_uniform_double)
|
reduce16(double, @__min_varying_double, @__min_uniform_double)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
|
define internal double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
|
||||||
reduce16(double, @__max_varying_double, @__max_uniform_double)
|
reduce16(double, @__max_varying_double, @__max_uniform_double)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -328,28 +335,28 @@ define double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; horizontal int64 ops
|
;; horizontal int64 ops
|
||||||
|
|
||||||
define <16 x i64> @__add_varying_int64(<16 x i64>,
|
define internal <16 x i64> @__add_varying_int64(<16 x i64>,
|
||||||
<16 x i64>) nounwind readnone alwaysinline {
|
<16 x i64>) nounwind readnone alwaysinline {
|
||||||
%s = add <16 x i64> %0, %1
|
%s = add <16 x i64> %0, %1
|
||||||
ret <16 x i64> %s
|
ret <16 x i64> %s
|
||||||
}
|
}
|
||||||
|
|
||||||
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||||
%s = add i64 %0, %1
|
%s = add i64 %0, %1
|
||||||
ret i64 %s
|
ret i64 %s
|
||||||
}
|
}
|
||||||
|
|
||||||
define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
|
define internal i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||||
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
|
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
|
define internal i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||||
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
|
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
|
define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||||
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
|
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -357,17 +364,17 @@ define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;;; horizontal uint64 ops
|
;;; horizontal uint64 ops
|
||||||
|
|
||||||
define i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
|
define internal i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
|
||||||
%r = call i64 @__reduce_add_int64(<16 x i64> %v)
|
%r = call i64 @__reduce_add_int64(<16 x i64> %v)
|
||||||
ret i64 %r
|
ret i64 %r
|
||||||
}
|
}
|
||||||
|
|
||||||
define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
define internal i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||||
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||||
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -635,7 +642,7 @@ gen_scatter(16, i64)
|
|||||||
|
|
||||||
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
||||||
|
|
||||||
define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
|
define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
|
||||||
unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
||||||
ret <16 x double> %ret
|
ret <16 x double> %ret
|
||||||
}
|
}
|
||||||
@@ -647,12 +654,12 @@ define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline
|
|||||||
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||||
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||||
|
|
||||||
define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
define internal <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||||
binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
||||||
ret <16 x double> %ret
|
ret <16 x double> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
define internal <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||||
binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
||||||
ret <16 x double> %ret
|
ret <16 x double> %ret
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -29,6 +29,13 @@
|
|||||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; *** Untested *** AVX target implementation.
|
||||||
|
;;
|
||||||
|
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||||
|
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||||
|
;; chance that there are bugs in the code in this file.
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; Basic 8-wide definitions
|
;; Basic 8-wide definitions
|
||||||
|
|
||||||
@@ -44,7 +51,7 @@ include(`builtins-avx-common.ll')
|
|||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
||||||
|
|
||||||
define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; float iv = __rcp_v(v);
|
; float iv = __rcp_v(v);
|
||||||
; return iv * (2. - v * iv);
|
; return iv * (2. - v * iv);
|
||||||
|
|
||||||
@@ -62,19 +69,19 @@ define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinl
|
|||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
||||||
|
|
||||||
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 8)
|
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 8)
|
||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
|
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
|
||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
|
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
|
||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
@@ -85,17 +92,17 @@ define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysin
|
|||||||
|
|
||||||
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
||||||
|
|
||||||
define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||||
round4to8double(%0, 8)
|
round4to8double(%0, 8)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||||
round4to8double(%0, 9)
|
round4to8double(%0, 9)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||||
round4to8double(%0, 10)
|
round4to8double(%0, 10)
|
||||||
}
|
}
|
||||||
@@ -106,7 +113,7 @@ define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alway
|
|||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
||||||
|
|
||||||
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||||
; float is = __rsqrt_v(v);
|
; float is = __rsqrt_v(v);
|
||||||
%is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
|
%is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
|
||||||
; return 0.5 * is * (3. - (v * is) * is);
|
; return 0.5 * is * (3. - (v * is) * is);
|
||||||
@@ -125,7 +132,7 @@ define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwa
|
|||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
||||||
|
|
||||||
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
%call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
|
%call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
|
||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
}
|
}
|
||||||
@@ -153,13 +160,13 @@ declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
|
|||||||
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||||
|
|
||||||
define <8 x float> @__max_varying_float(<8 x float>,
|
define internal <8 x float> @__max_varying_float(<8 x float>,
|
||||||
<8 x float>) nounwind readonly alwaysinline {
|
<8 x float>) nounwind readonly alwaysinline {
|
||||||
%call = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
|
%call = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
|
||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x float> @__min_varying_float(<8 x float>,
|
define internal <8 x float> @__min_varying_float(<8 x float>,
|
||||||
<8 x float>) nounwind readonly alwaysinline {
|
<8 x float>) nounwind readonly alwaysinline {
|
||||||
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
|
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
|
||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
@@ -169,12 +176,12 @@ define <8 x float> @__min_varying_float(<8 x float>,
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; int min/max
|
;; int min/max
|
||||||
|
|
||||||
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||||
ret <8 x i32> %ret
|
ret <8 x i32> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||||
ret <8 x i32> %ret
|
ret <8 x i32> %ret
|
||||||
}
|
}
|
||||||
@@ -183,12 +190,12 @@ define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly al
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; unsigned int min/max
|
;; unsigned int min/max
|
||||||
|
|
||||||
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||||
ret <8 x i32> %ret
|
ret <8 x i32> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||||
ret <8 x i32> %ret
|
ret <8 x i32> %ret
|
||||||
}
|
}
|
||||||
@@ -198,7 +205,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
|
|||||||
|
|
||||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||||
|
|
||||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||||
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
|
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
|
||||||
ret i32 %v
|
ret i32 %v
|
||||||
@@ -209,7 +216,7 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
|||||||
|
|
||||||
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||||
|
|
||||||
define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
|
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
|
||||||
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
|
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
|
||||||
%scalar1 = extractelement <8 x float> %v2, i32 0
|
%scalar1 = extractelement <8 x float> %v2, i32 0
|
||||||
@@ -219,12 +226,12 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||||
reduce8(float, @__min_varying_float, @__min_uniform_float)
|
reduce8(float, @__min_varying_float, @__min_uniform_float)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||||
reduce8(float, @__max_varying_float, @__max_uniform_float)
|
reduce8(float, @__max_varying_float, @__max_uniform_float)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -233,28 +240,28 @@ reduce_equal(8)
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; horizontal int32 ops
|
;; horizontal int32 ops
|
||||||
|
|
||||||
define <8 x i32> @__add_varying_int32(<8 x i32>,
|
define internal <8 x i32> @__add_varying_int32(<8 x i32>,
|
||||||
<8 x i32>) nounwind readnone alwaysinline {
|
<8 x i32>) nounwind readnone alwaysinline {
|
||||||
%s = add <8 x i32> %0, %1
|
%s = add <8 x i32> %0, %1
|
||||||
ret <8 x i32> %s
|
ret <8 x i32> %s
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
||||||
%s = add i32 %0, %1
|
%s = add i32 %0, %1
|
||||||
ret i32 %s
|
ret i32 %s
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
|
reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
|
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
|
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -262,17 +269,17 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;;; horizontal uint32 ops
|
;;; horizontal uint32 ops
|
||||||
|
|
||||||
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||||
ret i32 %r
|
ret i32 %r
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -282,7 +289,7 @@ define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
|||||||
|
|
||||||
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||||
|
|
||||||
define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
|
define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
|
||||||
%v0 = shufflevector <8 x double> %0, <8 x double> undef,
|
%v0 = shufflevector <8 x double> %0, <8 x double> undef,
|
||||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||||
%v1 = shufflevector <8 x double> %0, <8 x double> undef,
|
%v1 = shufflevector <8 x double> %0, <8 x double> undef,
|
||||||
@@ -296,12 +303,12 @@ define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline
|
|||||||
ret double %sum
|
ret double %sum
|
||||||
}
|
}
|
||||||
|
|
||||||
define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
|
define internal double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
|
||||||
reduce8(double, @__min_varying_double, @__min_uniform_double)
|
reduce8(double, @__min_varying_double, @__min_uniform_double)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
|
define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
|
||||||
reduce8(double, @__max_varying_double, @__max_uniform_double)
|
reduce8(double, @__max_varying_double, @__max_uniform_double)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -309,28 +316,28 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; horizontal int64 ops
|
;; horizontal int64 ops
|
||||||
|
|
||||||
define <8 x i64> @__add_varying_int64(<8 x i64>,
|
define internal <8 x i64> @__add_varying_int64(<8 x i64>,
|
||||||
<8 x i64>) nounwind readnone alwaysinline {
|
<8 x i64>) nounwind readnone alwaysinline {
|
||||||
%s = add <8 x i64> %0, %1
|
%s = add <8 x i64> %0, %1
|
||||||
ret <8 x i64> %s
|
ret <8 x i64> %s
|
||||||
}
|
}
|
||||||
|
|
||||||
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||||
%s = add i64 %0, %1
|
%s = add i64 %0, %1
|
||||||
ret i64 %s
|
ret i64 %s
|
||||||
}
|
}
|
||||||
|
|
||||||
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
|
define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||||
reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
|
reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
|
define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||||
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
|
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
|
define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||||
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
|
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -338,17 +345,17 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;;; horizontal uint64 ops
|
;;; horizontal uint64 ops
|
||||||
|
|
||||||
define i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
|
define internal i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
|
||||||
%r = call i64 @__reduce_add_int64(<8 x i64> %v)
|
%r = call i64 @__reduce_add_int64(<8 x i64> %v)
|
||||||
ret i64 %r
|
ret i64 %r
|
||||||
}
|
}
|
||||||
|
|
||||||
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||||
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -533,7 +540,7 @@ gen_scatter(8, i64)
|
|||||||
|
|
||||||
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
||||||
|
|
||||||
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||||
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
||||||
ret <8 x double> %ret
|
ret <8 x double> %ret
|
||||||
}
|
}
|
||||||
@@ -545,12 +552,12 @@ define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
|||||||
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||||
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||||
|
|
||||||
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||||
binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
||||||
ret <8 x double> %ret
|
ret <8 x double> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||||
binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
||||||
ret <8 x double> %ret
|
ret <8 x double> %ret
|
||||||
}
|
}
|
||||||
|
|||||||
27
builtins-c.c
27
builtins-c.c
@@ -57,7 +57,6 @@
|
|||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
|
|
||||||
typedef int Bool;
|
typedef int Bool;
|
||||||
@@ -133,8 +132,6 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
|||||||
case 'V': PRINT_VECTOR("%llu", unsigned long long);
|
case 'V': PRINT_VECTOR("%llu", unsigned long long);
|
||||||
case 'd': PRINT_SCALAR("%f", double);
|
case 'd': PRINT_SCALAR("%f", double);
|
||||||
case 'D': PRINT_VECTOR("%f", double);
|
case 'D': PRINT_VECTOR("%f", double);
|
||||||
case 'p': PRINT_SCALAR("%p", void *);
|
|
||||||
case 'P': PRINT_VECTOR("%p", void *);
|
|
||||||
default:
|
default:
|
||||||
printf("UNKNOWN TYPE ");
|
printf("UNKNOWN TYPE ");
|
||||||
putchar(*types);
|
putchar(*types);
|
||||||
@@ -150,21 +147,21 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
|||||||
|
|
||||||
int __num_cores() {
|
int __num_cores() {
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
// This is quite a hack. Including all of windows.h to get this definition
|
// This is quite a hack. Including all of windows.h to get this definition
|
||||||
// pulls in a bunch of stuff that leads to undefined symbols at link time.
|
// pulls in a bunch of stuff that leads to undefined symbols at link time.
|
||||||
// So we don't #include <windows.h> but instead have the equivalent declarations
|
// So we don't #include <windows.h> but instead have the equivalent declarations
|
||||||
// here. Presumably this struct declaration won't be changing in the future
|
// here. Presumably this struct declaration won't be changing in the future
|
||||||
// anyway...
|
// anyway...
|
||||||
struct SYSTEM_INFO {
|
struct SYSTEM_INFO {
|
||||||
int pad0[2];
|
int pad0[2];
|
||||||
void *pad1[2];
|
void *pad1[2];
|
||||||
int *pad2;
|
int *pad2;
|
||||||
int dwNumberOfProcessors;
|
int dwNumberOfProcessors;
|
||||||
int pad3[3];
|
int pad3[3];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct SYSTEM_INFO sysInfo;
|
struct SYSTEM_INFO sysInfo;
|
||||||
extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
|
extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
|
||||||
GetSystemInfo(&sysInfo);
|
GetSystemInfo(&sysInfo);
|
||||||
return sysInfo.dwNumberOfProcessors;
|
return sysInfo.dwNumberOfProcessors;
|
||||||
#else
|
#else
|
||||||
|
|||||||
@@ -1,123 +0,0 @@
|
|||||||
;; Copyright (c) 2011, Intel Corporation
|
|
||||||
;; All rights reserved.
|
|
||||||
;;
|
|
||||||
;; Redistribution and use in source and binary forms, with or without
|
|
||||||
;; modification, are permitted provided that the following conditions are
|
|
||||||
;; met:
|
|
||||||
;;
|
|
||||||
;; * Redistributions of source code must retain the above copyright
|
|
||||||
;; notice, this list of conditions and the following disclaimer.
|
|
||||||
;;
|
|
||||||
;; * Redistributions in binary form must reproduce the above copyright
|
|
||||||
;; notice, this list of conditions and the following disclaimer in the
|
|
||||||
;; documentation and/or other materials provided with the distribution.
|
|
||||||
;;
|
|
||||||
;; * Neither the name of Intel Corporation nor the names of its
|
|
||||||
;; contributors may be used to endorse or promote products derived from
|
|
||||||
;; this software without specific prior written permission.
|
|
||||||
;;
|
|
||||||
;;
|
|
||||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
|
||||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
||||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
||||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
||||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
||||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
||||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
||||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
||||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
|
|
||||||
;; This file defines various functions that are used when generating the
|
|
||||||
;; "dispatch" object/assembly file that has entrypoints for each
|
|
||||||
;; exported function in a module that dispatch to the best available
|
|
||||||
;; variant of that function that will run on the system's CPU.
|
|
||||||
|
|
||||||
;; Stores the best target ISA that the system on which we're actually
|
|
||||||
;; running supports. -1 represents "uninitialized", otherwise this value
|
|
||||||
;; should correspond to one of the enumerant values of Target::ISA from
|
|
||||||
;; ispc.h.
|
|
||||||
|
|
||||||
@__system_best_isa = internal global i32 -1
|
|
||||||
|
|
||||||
declare void @abort() noreturn
|
|
||||||
|
|
||||||
;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
|
|
||||||
;; following code... Specifically, __get_system_isa should return a value
|
|
||||||
;; corresponding to one of the Target::ISA enumerant values that gives the
|
|
||||||
;; most capable ISA that the current system can run.
|
|
||||||
;;
|
|
||||||
;; #ifdef _MSC_VER
|
|
||||||
;; extern void __stdcall __cpuid(int info[4], int infoType);
|
|
||||||
;; #else
|
|
||||||
;; static void __cpuid(int info[4], int infoType) {
|
|
||||||
;; __asm__ __volatile__ ("cpuid"
|
|
||||||
;; : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
|
|
||||||
;; : "0" (infoType));
|
|
||||||
;; }
|
|
||||||
;; #endif
|
|
||||||
;;
|
|
||||||
;; int32_t __get_system_isa() {
|
|
||||||
;; int info[4];
|
|
||||||
;; __cpuid(info, 1);
|
|
||||||
;; /* NOTE: the values returned below must be the same as the
|
|
||||||
;; corresponding enumerant values in Target::ISA. */
|
|
||||||
;; if ((info[2] & (1 << 28)) != 0)
|
|
||||||
;; return 2; // AVX
|
|
||||||
;; else if ((info[2] & (1 << 19)) != 0)
|
|
||||||
;; return 1; // SSE4
|
|
||||||
;; else if ((info[3] & (1 << 26)) != 0)
|
|
||||||
;; return 0; // SSE2
|
|
||||||
;; else
|
|
||||||
;; abort();
|
|
||||||
;; }
|
|
||||||
|
|
||||||
%0 = type { i32, i32, i32, i32 }
|
|
||||||
|
|
||||||
define i32 @__get_system_isa() nounwind ssp {
|
|
||||||
%1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
|
|
||||||
%2 = extractvalue %0 %1, 2
|
|
||||||
%3 = extractvalue %0 %1, 3
|
|
||||||
%4 = and i32 %2, 268435456
|
|
||||||
%5 = icmp eq i32 %4, 0
|
|
||||||
br i1 %5, label %6, label %13
|
|
||||||
|
|
||||||
; <label>:6 ; preds = %0
|
|
||||||
%7 = and i32 %2, 524288
|
|
||||||
%8 = icmp eq i32 %7, 0
|
|
||||||
br i1 %8, label %9, label %13
|
|
||||||
|
|
||||||
; <label>:9 ; preds = %6
|
|
||||||
%10 = and i32 %3, 67108864
|
|
||||||
%11 = icmp eq i32 %10, 0
|
|
||||||
br i1 %11, label %12, label %13
|
|
||||||
|
|
||||||
; <label>:12 ; preds = %9
|
|
||||||
tail call void @abort() noreturn nounwind
|
|
||||||
unreachable
|
|
||||||
|
|
||||||
; <label>:13 ; preds = %9, %6, %0
|
|
||||||
%.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]
|
|
||||||
ret i32 %.0
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;; This function is called by each of the dispatch functions we generate;
|
|
||||||
;; it sets @__system_best_isa if it is unset.
|
|
||||||
|
|
||||||
define void @__set_system_isa() {
|
|
||||||
entry:
|
|
||||||
%bi = load i32* @__system_best_isa
|
|
||||||
%unset = icmp eq i32 %bi, -1
|
|
||||||
br i1 %unset, label %set_system_isa, label %done
|
|
||||||
|
|
||||||
set_system_isa:
|
|
||||||
%bival = call i32 @__get_system_isa()
|
|
||||||
store i32 %bival, i32* @__system_best_isa
|
|
||||||
ret void
|
|
||||||
|
|
||||||
done:
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
417
builtins-sse.ll
Normal file
417
builtins-sse.ll
Normal file
@@ -0,0 +1,417 @@
|
|||||||
|
;; Copyright (c) 2010-2011, Intel Corporation
|
||||||
|
;; All rights reserved.
|
||||||
|
;;
|
||||||
|
;; Redistribution and use in source and binary forms, with or without
|
||||||
|
;; modification, are permitted provided that the following conditions are
|
||||||
|
;; met:
|
||||||
|
;;
|
||||||
|
;; * Redistributions of source code must retain the above copyright
|
||||||
|
;; notice, this list of conditions and the following disclaimer.
|
||||||
|
;;
|
||||||
|
;; * Redistributions in binary form must reproduce the above copyright
|
||||||
|
;; notice, this list of conditions and the following disclaimer in the
|
||||||
|
;; documentation and/or other materials provided with the distribution.
|
||||||
|
;;
|
||||||
|
;; * Neither the name of Intel Corporation nor the names of its
|
||||||
|
;; contributors may be used to endorse or promote products derived from
|
||||||
|
;; this software without specific prior written permission.
|
||||||
|
;;
|
||||||
|
;;
|
||||||
|
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
;; This file declares implementations of various stdlib builtins that
|
||||||
|
;; only require SSE version 1 and 2 functionality; this file, in turn,
|
||||||
|
;; is then included by builtins-sse2.ll and builtins-sse4.ll to provide
|
||||||
|
;; those definitions for them.
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
int64minmax(4)
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; rcp
|
||||||
|
|
||||||
|
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
|
define internal <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||||
|
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
|
||||||
|
; do one N-R iteration to improve precision
|
||||||
|
; float iv = __rcp_v(v);
|
||||||
|
; return iv * (2. - v * iv);
|
||||||
|
%v_iv = fmul <4 x float> %0, %call
|
||||||
|
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
|
||||||
|
%iv_mul = fmul <4 x float> %call, %two_minus
|
||||||
|
ret <4 x float> %iv_mul
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; do the rcpss call
|
||||||
|
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
||||||
|
%scall = extractelement <4 x float> %call, i32 0
|
||||||
|
|
||||||
|
; do one N-R iteration to improve precision, as above
|
||||||
|
%v_iv = fmul float %0, %scall
|
||||||
|
%two_minus = fsub float 2., %v_iv
|
||||||
|
%iv_mul = fmul float %scall, %two_minus
|
||||||
|
ret float %iv_mul
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
; rsqrt
|
||||||
|
|
||||||
|
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
|
define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||||
|
; float is = __rsqrt_v(v);
|
||||||
|
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
|
||||||
|
; Newton-Raphson iteration to improve precision
|
||||||
|
; return 0.5 * is * (3. - (v * is) * is);
|
||||||
|
%v_is = fmul <4 x float> %v, %is
|
||||||
|
%v_is_is = fmul <4 x float> %v_is, %is
|
||||||
|
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
|
||||||
|
%is_mul = fmul <4 x float> %is, %three_sub
|
||||||
|
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||||
|
ret <4 x float> %half_scale
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||||
|
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||||
|
%is = extractelement <4 x float> %vis, i32 0
|
||||||
|
|
||||||
|
; Newton-Raphson iteration to improve precision
|
||||||
|
; return 0.5 * is * (3. - (v * is) * is);
|
||||||
|
%v_is = fmul float %0, %is
|
||||||
|
%v_is_is = fmul float %v_is, %is
|
||||||
|
%three_sub = fsub float 3., %v_is_is
|
||||||
|
%is_mul = fmul float %is, %three_sub
|
||||||
|
%half_scale = fmul float 0.5, %is_mul
|
||||||
|
ret float %half_scale
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
; sqrt
|
||||||
|
|
||||||
|
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
|
define internal <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||||
|
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
|
||||||
|
ret <4 x float> %call
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||||
|
ret float %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; fast math mode
|
||||||
|
|
||||||
|
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||||
|
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||||
|
|
||||||
|
define internal void @__fastmath() nounwind alwaysinline {
|
||||||
|
%ptr = alloca i32
|
||||||
|
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||||
|
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||||
|
%oldval = load i32 *%ptr
|
||||||
|
|
||||||
|
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||||
|
%update = or i32 %oldval, 32832
|
||||||
|
store i32 %update, i32 *%ptr
|
||||||
|
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
; svml stuff
|
||||||
|
|
||||||
|
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
|
||||||
|
|
||||||
|
define internal <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
|
||||||
|
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
|
||||||
|
ret <4 x float> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
|
||||||
|
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
|
||||||
|
ret <4 x float> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
|
||||||
|
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
|
||||||
|
store <4 x float> %s, <4 x float> * %1
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
|
||||||
|
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
|
||||||
|
ret <4 x float> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
|
||||||
|
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
|
||||||
|
ret <4 x float> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
||||||
|
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
|
||||||
|
ret <4 x float> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
|
||||||
|
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
|
||||||
|
ret <4 x float> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
|
||||||
|
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
|
||||||
|
ret <4 x float> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
||||||
|
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
|
||||||
|
ret <4 x float> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; float min/max
|
||||||
|
|
||||||
|
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
|
||||||
|
define internal <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
||||||
|
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
|
||||||
|
ret <4 x float> %call
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||||
|
ret float %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
||||||
|
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
|
||||||
|
ret <4 x float> %call
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||||
|
ret float %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; double precision sqrt
|
||||||
|
|
||||||
|
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||||
|
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||||
|
|
||||||
|
define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
|
||||||
|
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||||
|
ret <4 x double> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||||
|
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
|
||||||
|
ret double %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; double precision min/max
|
||||||
|
|
||||||
|
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||||
|
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||||
|
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||||
|
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||||
|
|
||||||
|
define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||||
|
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||||
|
ret <4 x double> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
define internal double @__min_uniform_double(double, double) nounwind readnone {
|
||||||
|
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
||||||
|
ret double %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||||
|
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||||
|
ret <4 x double> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
define internal double @__max_uniform_double(double, double) nounwind readnone {
|
||||||
|
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
||||||
|
ret double %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
; horizontal ops / reductions
|
||||||
|
|
||||||
|
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
|
define internal i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||||
|
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||||
|
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||||
|
ret i32 %v
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal float @__reduce_min_float(<4 x float>) nounwind readnone {
|
||||||
|
reduce4(float, @__min_varying_float, @__min_uniform_float)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal float @__reduce_max_float(<4 x float>) nounwind readnone {
|
||||||
|
reduce4(float, @__max_varying_float, @__max_uniform_float)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
|
||||||
|
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
|
||||||
|
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||||
|
%m1 = add <4 x i32> %v1, %v
|
||||||
|
%m1a = extractelement <4 x i32> %m1, i32 0
|
||||||
|
%m1b = extractelement <4 x i32> %m1, i32 1
|
||||||
|
%sum = add i32 %m1a, %m1b
|
||||||
|
ret i32 %sum
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
|
||||||
|
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
||||||
|
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
|
||||||
|
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
|
||||||
|
ret i32 %r
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
||||||
|
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
||||||
|
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
|
||||||
|
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
|
||||||
|
<2 x i32> <i32 0, i32 1>
|
||||||
|
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
|
||||||
|
<2 x i32> <i32 2, i32 3>
|
||||||
|
%sum = fadd <2 x double> %v0, %v1
|
||||||
|
%e0 = extractelement <2 x double> %sum, i32 0
|
||||||
|
%e1 = extractelement <2 x double> %sum, i32 1
|
||||||
|
%m = fadd double %e0, %e1
|
||||||
|
ret double %m
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
|
||||||
|
reduce4(double, @__min_varying_double, @__min_uniform_double)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
|
||||||
|
reduce4(double, @__max_varying_double, @__max_uniform_double)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
|
||||||
|
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
||||||
|
<2 x i32> <i32 0, i32 1>
|
||||||
|
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
||||||
|
<2 x i32> <i32 2, i32 3>
|
||||||
|
%sum = add <2 x i64> %v0, %v1
|
||||||
|
%e0 = extractelement <2 x i64> %sum, i32 0
|
||||||
|
%e1 = extractelement <2 x i64> %sum, i32 1
|
||||||
|
%m = add i64 %e0, %e1
|
||||||
|
ret i64 %m
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
|
||||||
|
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
|
||||||
|
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
|
||||||
|
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
|
||||||
|
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||||
|
}
|
||||||
|
|
||||||
|
reduce_equal(4)
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; masked store
|
||||||
|
|
||||||
|
masked_store_blend_8_16_by_4()
|
||||||
|
|
||||||
|
gen_masked_store(4, i8, 8)
|
||||||
|
gen_masked_store(4, i16, 16)
|
||||||
|
gen_masked_store(4, i32, 32)
|
||||||
|
gen_masked_store(4, i64, 64)
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; unaligned loads/loads+broadcasts
|
||||||
|
|
||||||
|
load_and_broadcast(4, i8, 8)
|
||||||
|
load_and_broadcast(4, i16, 16)
|
||||||
|
load_and_broadcast(4, i32, 32)
|
||||||
|
load_and_broadcast(4, i64, 64)
|
||||||
|
|
||||||
|
load_masked(4, i8, 8, 1)
|
||||||
|
load_masked(4, i16, 16, 2)
|
||||||
|
load_masked(4, i32, 32, 4)
|
||||||
|
load_masked(4, i64, 64, 8)
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; gather/scatter
|
||||||
|
|
||||||
|
; define these with the macros from stdlib.m4
|
||||||
|
|
||||||
|
gen_gather(4, i8)
|
||||||
|
gen_gather(4, i16)
|
||||||
|
gen_gather(4, i32)
|
||||||
|
gen_gather(4, i64)
|
||||||
|
|
||||||
|
gen_scatter(4, i8)
|
||||||
|
gen_scatter(4, i16)
|
||||||
|
gen_scatter(4, i32)
|
||||||
|
gen_scatter(4, i64)
|
||||||
@@ -1,266 +0,0 @@
|
|||||||
;; Copyright (c) 2010-2011, Intel Corporation
|
|
||||||
;; All rights reserved.
|
|
||||||
;;
|
|
||||||
;; Redistribution and use in source and binary forms, with or without
|
|
||||||
;; modification, are permitted provided that the following conditions are
|
|
||||||
;; met:
|
|
||||||
;;
|
|
||||||
;; * Redistributions of source code must retain the above copyright
|
|
||||||
;; notice, this list of conditions and the following disclaimer.
|
|
||||||
;;
|
|
||||||
;; * Redistributions in binary form must reproduce the above copyright
|
|
||||||
;; notice, this list of conditions and the following disclaimer in the
|
|
||||||
;; documentation and/or other materials provided with the distribution.
|
|
||||||
;;
|
|
||||||
;; * Neither the name of Intel Corporation nor the names of its
|
|
||||||
;; contributors may be used to endorse or promote products derived from
|
|
||||||
;; this software without specific prior written permission.
|
|
||||||
;;
|
|
||||||
;;
|
|
||||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
|
||||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
||||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
||||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
||||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
||||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
||||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
||||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
||||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; rcp
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
;; Scalar reciprocal of %0: takes the estimate produced by
;; @llvm.x86.sse.rcp.ss and refines it with one Newton-Raphson step.
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
  ; the intrinsic takes a 4-wide vector; only lane 0 is populated and read back
  %x_vec = insertelement <4 x float> undef, float %0, i32 0
  %est_vec = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %x_vec)
  %est = extractelement <4 x float> %est_vec, i32 0

  ; Newton-Raphson refinement: est * (2 - x * est)
  %x_est = fmul float %0, %est
  %corr = fsub float 2., %x_est
  %refined = fmul float %est, %corr
  ret float %refined
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; rsqrt
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
;; Scalar reciprocal square root of %0: takes the estimate produced by
;; @llvm.x86.sse.rsqrt.ss and refines it with one Newton-Raphson step.
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  ; the intrinsic takes a 4-wide vector; only lane 0 is populated and read back
  %x_vec = insertelement <4 x float> undef, float %0, i32 0
  %est_vec = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %x_vec)
  %est = extractelement <4 x float> %est_vec, i32 0

  ; Newton-Raphson refinement: 0.5 * est * (3 - (x * est) * est)
  %x_est = fmul float %0, %est
  %x_est_sq = fmul float %x_est, %est
  %corr = fsub float 3., %x_est_sq
  %est_corr = fmul float %est, %corr
  %refined = fmul float 0.5, %est_corr
  ret float %refined
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; sqrt
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
|
|
||||||
;; Scalar single-precision square root of %0, computed through the
;; @llvm.x86.sse.sqrt.ss intrinsic.
;; NOTE(review): sse_unary_scalar is an m4 macro defined outside this file;
;; presumably it places %0 in lane 0 of a 4-wide float vector, calls the
;; intrinsic and binds lane 0 of the result to %ret -- confirm against the
;; macro definition.
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
|
||||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
|
||||||
ret float %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; fast math mode
|
|
||||||
|
|
||||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
|
||||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
|
||||||
|
|
||||||
;; Enables the DAZ and FTZ bits in the SSE control/status register (MXCSR),
;; leaving all other bits of the register as they were.
define void @__fastmath() nounwind alwaysinline {
  ; stmxcsr/ldmxcsr work through memory, so stage the register in a stack slot
  %csr_slot = alloca i32
  %csr_slot_i8 = bitcast i32 * %csr_slot to i8 *
  call void @llvm.x86.sse.stmxcsr(i8 * %csr_slot_i8)
  %csr_old = load i32 *%csr_slot

  ; DAZ is bit value 64 and FTZ is bit value 32768; together 32832
  %csr_new = or i32 %csr_old, 32832
  store i32 %csr_new, i32 *%csr_slot
  call void @llvm.x86.sse.ldmxcsr(i8 * %csr_slot_i8)
  ret void
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; float min/max
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
|
||||||
ret float %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
|
||||||
ret float %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; double precision sqrt
|
|
||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
|
||||||
|
|
||||||
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
|
||||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
|
|
||||||
ret double %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; double precision min/max
|
|
||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
||||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
||||||
|
|
||||||
define double @__min_uniform_double(double, double) nounwind readnone {
|
|
||||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
|
||||||
ret double %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define double @__max_uniform_double(double, double) nounwind readnone {
|
|
||||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
|
||||||
ret double %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; rounding
|
|
||||||
;;
|
|
||||||
;; There are not any rounding instructions in SSE2, so we have to emulate
|
|
||||||
;; the functionality with multiple instructions...
|
|
||||||
|
|
||||||
; The code for __round_* is the result of compiling the following source
|
|
||||||
; code.
|
|
||||||
;
|
|
||||||
; export float Round(float x) {
|
|
||||||
; unsigned int sign = signbits(x);
|
|
||||||
; unsigned int ix = intbits(x);
|
|
||||||
; ix ^= sign;
|
|
||||||
; x = floatbits(ix);
|
|
||||||
; x += 0x1.0p23f;
|
|
||||||
; x -= 0x1.0p23f;
|
|
||||||
; ix = intbits(x);
|
|
||||||
; ix ^= sign;
|
|
||||||
; x = floatbits(ix);
|
|
||||||
; return x;
|
|
||||||
;}
|
|
||||||
|
|
||||||
;; Rounds %0 to an integral value without a dedicated rounding instruction:
;; the sign bit is stripped, 2^23 is added and subtracted again so that the
;; float addition discards the fractional bits, and the sign is restored.
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
  ; isolate the sign bit and clear it from the value
  %x_bits = bitcast float %0 to i32
  %sign = and i32 %x_bits, -2147483648
  %abs_bits = xor i32 %sign, %x_bits
  %abs = bitcast i32 %abs_bits to float

  ; 8.388608e+06 == 2^23
  %shifted = fadd float %abs, 8.388608e+06
  %abs_rounded = fadd float %shifted, -8.388608e+06

  ; put the original sign bit back
  %abs_rounded_bits = bitcast float %abs_rounded to i32
  %rounded_bits = xor i32 %abs_rounded_bits, %sign
  %rounded = bitcast i32 %rounded_bits to float
  ret float %rounded
}
|
|
||||||
|
|
||||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
|
||||||
;; bitcode from compiling the following source code...
|
|
||||||
|
|
||||||
;export float Floor(float x) {
|
|
||||||
; float y = Round(x);
|
|
||||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
|
||||||
; float delta = -1.f;
|
|
||||||
; unsigned int idelta = intbits(delta);
|
|
||||||
; idelta &= cmp;
|
|
||||||
; delta = floatbits(idelta);
|
|
||||||
; return y + delta;
|
|
||||||
;}
|
|
||||||
|
|
||||||
;; Floor of %0: rounds with @__round_uniform_float and then subtracts 1 if
;; the rounded value ended up greater than the input.
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  %rounded = tail call float @__round_uniform_float(float %0) nounwind
  %too_big = fcmp ogt float %rounded, %0

  ; all-ones mask when the rounded value overshot, zero otherwise
  %mask = sext i1 %too_big to i32
  ; -1082130432 is the bit pattern of -1.0f; masking yields -1.0f or 0.0f
  %delta_bits = and i32 %mask, -1082130432
  %delta = bitcast i32 %delta_bits to float

  %floored = fadd float %rounded, %delta
  ret float %floored
}
|
|
||||||
|
|
||||||
;; And here is the code we compiled to get the __ceil* functions below
|
|
||||||
;
|
|
||||||
;export uniform float Ceil(uniform float x) {
|
|
||||||
; uniform float y = Round(x);
|
|
||||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
|
||||||
; uniform float delta = 1.f;
|
|
||||||
; uniform int idelta = intbits(delta);
|
|
||||||
; idelta &= yltx;
|
|
||||||
; delta = floatbits(idelta);
|
|
||||||
; return y + delta;
|
|
||||||
;}
|
|
||||||
|
|
||||||
;; Ceiling of %0: rounds with @__round_uniform_float and then adds 1 if the
;; rounded value ended up less than the input.
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  %rounded = tail call float @__round_uniform_float(float %0) nounwind
  %too_small = fcmp olt float %rounded, %0

  ; all-ones mask when the rounded value undershot, zero otherwise
  %mask = sext i1 %too_small to i32
  ; 1065353216 is the bit pattern of 1.0f; masking yields 1.0f or 0.0f
  %delta_bits = and i32 %mask, 1065353216
  %delta = bitcast i32 %delta_bits to float

  %ceiled = fadd float %rounded, %delta
  ret float %ceiled
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; rounding doubles
|
|
||||||
|
|
||||||
declare double @round(double)
|
|
||||||
declare double @floor(double)
|
|
||||||
declare double @ceil(double)
|
|
||||||
|
|
||||||
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
|
||||||
%r = call double @round(double %0)
|
|
||||||
ret double %r
|
|
||||||
}
|
|
||||||
|
|
||||||
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
|
||||||
%r = call double @floor(double %0)
|
|
||||||
ret double %r
|
|
||||||
}
|
|
||||||
|
|
||||||
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
|
||||||
%r = call double @ceil(double %0)
|
|
||||||
ret double %r
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; horizontal ops / reductions
|
|
||||||
|
|
||||||
declare i32 @llvm.ctpop.i32(i32)
|
|
||||||
declare i64 @llvm.ctpop.i64(i64)
|
|
||||||
|
|
||||||
;; Population count of the 32-bit value %0, forwarded to @llvm.ctpop.i32.
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %bits_set = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %bits_set
}
|
|
||||||
|
|
||||||
;; Population count of the 64-bit value %0, forwarded to @llvm.ctpop.i64.
define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
  %bits_set = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %bits_set
}
|
|
||||||
|
|
||||||
|
|
||||||
@@ -1,631 +0,0 @@
|
|||||||
;; Copyright (c) 2010-2011, Intel Corporation
|
|
||||||
;; All rights reserved.
|
|
||||||
;;
|
|
||||||
;; Redistribution and use in source and binary forms, with or without
|
|
||||||
;; modification, are permitted provided that the following conditions are
|
|
||||||
;; met:
|
|
||||||
;;
|
|
||||||
;; * Redistributions of source code must retain the above copyright
|
|
||||||
;; notice, this list of conditions and the following disclaimer.
|
|
||||||
;;
|
|
||||||
;; * Redistributions in binary form must reproduce the above copyright
|
|
||||||
;; notice, this list of conditions and the following disclaimer in the
|
|
||||||
;; documentation and/or other materials provided with the distribution.
|
|
||||||
;;
|
|
||||||
;; * Neither the name of Intel Corporation nor the names of its
|
|
||||||
;; contributors may be used to endorse or promote products derived from
|
|
||||||
;; this software without specific prior written permission.
|
|
||||||
;;
|
|
||||||
;;
|
|
||||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
|
||||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
||||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
||||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
||||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
||||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
||||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
||||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
||||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
|
|
||||||
|
|
||||||
;; This file defines the target for "double-pumped" SSE2, i.e. running
|
|
||||||
;; with 8-wide vectors
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; standard 8-wide definitions from m4 macros
|
|
||||||
|
|
||||||
stdlib_core(8)
|
|
||||||
packed_load_and_store(8)
|
|
||||||
scans(8)
|
|
||||||
int64minmax(8)
|
|
||||||
|
|
||||||
include(`builtins-sse2-common.ll')
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; rcp
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
;; 8-wide reciprocal of %0: a reciprocal estimate from the 4-wide
;; @llvm.x86.sse.rcp.ps intrinsic, refined by one Newton-Raphson step.
define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; unary4to8 is an m4 macro defined elsewhere; its result is bound to %call
  unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)

  ; Newton-Raphson refinement: est * (2 - x * est)
  %nr_prod = fmul <8 x float> %0, %call
  %nr_corr = fsub <8 x float> <float 2., float 2., float 2., float 2.,
                               float 2., float 2., float 2., float 2.>, %nr_prod
  %nr_result = fmul <8 x float> %call, %nr_corr
  ret <8 x float> %nr_result
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; rsqrt
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
;; 8-wide reciprocal square root of %v: an estimate from the 4-wide
;; @llvm.x86.sse.rsqrt.ps intrinsic, refined by one Newton-Raphson step.
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
  ; unary4to8 is an m4 macro defined elsewhere; its result is bound to %is
  unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)

  ; Newton-Raphson refinement: 0.5 * is * (3 - (v * is) * is)
  %nr_prod = fmul <8 x float> %v, %is
  %nr_prod_sq = fmul <8 x float> %nr_prod, %is
  %nr_corr = fsub <8 x float> <float 3., float 3., float 3., float 3.,
                               float 3., float 3., float 3., float 3.>, %nr_prod_sq
  %nr_scaled = fmul <8 x float> %is, %nr_corr
  %nr_result = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
                                 float 0.5, float 0.5, float 0.5, float 0.5>, %nr_scaled
  ret <8 x float> %nr_result
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; sqrt
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
|
||||||
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
|
|
||||||
ret <8 x float> %call
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; svml stuff
|
|
||||||
|
|
||||||
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
|
|
||||||
; Not readnone: the cosines are returned through the pointer argument, so
; the call writes memory that the caller subsequently loads.
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind
|
|
||||||
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
|
|
||||||
define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
|
|
||||||
unary4to8(ret, float, @__svml_sinf4, %0)
|
|
||||||
ret <8 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
|
|
||||||
unary4to8(ret, float, @__svml_cosf4, %0)
|
|
||||||
ret <8 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Computes the sine and cosine of all 8 lanes of %0 by calling the 4-wide
;; @__svml_sincosf4 once per half. The sines are stored through %1 and the
;; cosines through %2.
;;
;; This function must not carry the readnone attribute: it stores its
;; results through both pointer arguments and loads the cosine temporaries
;; back from the stack. With readnone the optimizer would be entitled to
;; drop or reorder those memory operations.
define void @__svml_sincos(<8 x float>, <8 x float> *,
                           <8 x float> *) nounwind alwaysinline {
  ; stack slots that receive the cosines of each 4-wide half
  %cospa = alloca <4 x float>
  %cospb = alloca <4 x float>

  ; split the input into lanes 0-3 and lanes 4-7
  %a = shufflevector <8 x float> %0, <8 x float> undef,
         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b = shufflevector <8 x float> %0, <8 x float> undef,
         <4 x i32> <i32 4, i32 5, i32 6, i32 7>

  ; each call returns the sines; the cosines are read back from the slot
  ; passed as the pointer argument
  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)

  ; reassemble the 8-wide sine result and write it out
  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                    i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %sin, <8 x float> * %1

  ; reassemble the 8-wide cosine result and write it out
  %cosa = load <4 x float> * %cospa
  %cosb = load <4 x float> * %cospb
  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                    i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %cos, <8 x float> * %2

  ret void
}
|
|
||||||
|
|
||||||
define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
|
|
||||||
unary4to8(ret, float, @__svml_tanf4, %0)
|
|
||||||
ret <8 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
|
|
||||||
unary4to8(ret, float, @__svml_atanf4, %0)
|
|
||||||
ret <8 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <8 x float> @__svml_atan2(<8 x float>,
|
|
||||||
<8 x float>) nounwind readnone alwaysinline {
|
|
||||||
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
|
|
||||||
ret <8 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
|
|
||||||
unary4to8(ret, float, @__svml_expf4, %0)
|
|
||||||
ret <8 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
|
|
||||||
unary4to8(ret, float, @__svml_logf4, %0)
|
|
||||||
ret <8 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <8 x float> @__svml_pow(<8 x float>,
|
|
||||||
<8 x float>) nounwind readnone alwaysinline {
|
|
||||||
binary4to8(ret, float, @__svml_powf4, %0, %1)
|
|
||||||
ret <8 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; float min/max
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
|
||||||
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
|
|
||||||
ret <8 x float> %call
|
|
||||||
}
|
|
||||||
|
|
||||||
define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
|
||||||
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
|
|
||||||
ret <8 x float> %call
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; min/max
|
|
||||||
|
|
||||||
; There is no blend instruction with SSE2, so we simulate it with bit
|
|
||||||
; operations on i32s. For these two vselect functions, for each
|
|
||||||
; vector element, if the mask is on, we return the corresponding value
|
|
||||||
; from %1, and otherwise return the value from %0.
|
|
||||||
|
|
||||||
;; Bitwise blend of two 8-wide i32 vectors: for every bit set in %mask the
;; result takes the bit from %1, otherwise the bit from %0.
define <8 x i32> @__vselect_i32(<8 x i32>, <8 x i32> ,
                                <8 x i32> %mask) nounwind readnone alwaysinline {
  ; bits of %1 kept where the mask is on
  %from_new = and <8 x i32> %1, %mask

  ; bits of %0 kept where the mask is off
  %inv_mask = xor <8 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %from_old = and <8 x i32> %0, %inv_mask

  %blended = or <8 x i32> %from_old, %from_new
  ret <8 x i32> %blended
}
|
|
||||||
|
|
||||||
;; Float flavour of the mask-driven blend: reinterprets both operands as
;; i32 vectors, blends them with @__vselect_i32 and reinterprets the result
;; back to floats.
define <8 x float> @__vselect_float(<8 x float>, <8 x float>,
                                    <8 x i32> %mask) nounwind readnone alwaysinline {
  %old_bits = bitcast <8 x float> %0 to <8 x i32>
  %new_bits = bitcast <8 x float> %1 to <8 x i32>
  %sel_bits = call <8 x i32> @__vselect_i32(<8 x i32> %old_bits, <8 x i32> %new_bits, <8 x i32> %mask)
  %sel = bitcast <8 x i32> %sel_bits to <8 x float>
  ret <8 x float> %sel
}
|
|
||||||
|
|
||||||
|
|
||||||
; To do vector integer min and max, we do the vector compare and then sign
|
|
||||||
; extend the i1 vector result to an i32 mask. The __vselect does the
|
|
||||||
; rest...
|
|
||||||
|
|
||||||
;; Per-lane signed minimum of two 8-wide i32 vectors.
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  ; all-ones in each lane where %0 < %1 (signed)
  %lt = icmp slt <8 x i32> %0, %1
  %lt_mask = sext <8 x i1> %lt to <8 x i32>
  ; take %0 where the mask is on, %1 elsewhere
  %min = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %lt_mask)
  ret <8 x i32> %min
}
|
|
||||||
|
|
||||||
;; Signed minimum of two i32 scalars.
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %first_is_less = icmp slt i32 %0, %1
  %min = select i1 %first_is_less, i32 %0, i32 %1
  ret i32 %min
}
|
|
||||||
|
|
||||||
;; Per-lane signed maximum of two 8-wide i32 vectors.
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  ; all-ones in each lane where %0 > %1 (signed)
  %gt = icmp sgt <8 x i32> %0, %1
  %gt_mask = sext <8 x i1> %gt to <8 x i32>
  ; take %0 where the mask is on, %1 elsewhere
  %max = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %gt_mask)
  ret <8 x i32> %max
}
|
|
||||||
|
|
||||||
;; Signed maximum of two i32 scalars.
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %first_is_greater = icmp sgt i32 %0, %1
  %max = select i1 %first_is_greater, i32 %0, i32 %1
  ret i32 %max
}
|
|
||||||
|
|
||||||
; The functions for unsigned ints are similar, just with unsigned
|
|
||||||
; comparison functions...
|
|
||||||
|
|
||||||
;; Per-lane unsigned minimum of two 8-wide i32 vectors.
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  ; all-ones in each lane where %0 < %1 (unsigned)
  %lt = icmp ult <8 x i32> %0, %1
  %lt_mask = sext <8 x i1> %lt to <8 x i32>
  ; take %0 where the mask is on, %1 elsewhere
  %min = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %lt_mask)
  ret <8 x i32> %min
}
|
|
||||||
|
|
||||||
;; Unsigned minimum of two i32 scalars.
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %first_is_less = icmp ult i32 %0, %1
  %min = select i1 %first_is_less, i32 %0, i32 %1
  ret i32 %min
}
|
|
||||||
|
|
||||||
;; Per-lane unsigned maximum of two 8-wide i32 vectors.
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  ; all-ones in each lane where %0 > %1 (unsigned)
  %gt = icmp ugt <8 x i32> %0, %1
  %gt_mask = sext <8 x i1> %gt to <8 x i32>
  ; take %0 where the mask is on, %1 elsewhere
  %max = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %gt_mask)
  ret <8 x i32> %max
}
|
|
||||||
|
|
||||||
;; Unsigned maximum of two i32 scalars.
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %first_is_greater = icmp ugt i32 %0, %1
  %max = select i1 %first_is_greater, i32 %0, i32 %1
  ret i32 %max
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; horizontal ops / reductions
|
|
||||||
|
|
||||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
;; Builds one i32 mask from an 8-wide i32 vector by running the 4-wide
;; @llvm.x86.sse.movmsk.ps intrinsic on lanes 0-3 and on lanes 4-7 and
;; combining the two results.
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
|
||||||
; first do two 4-wide movmsk calls
|
|
||||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
|
||||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
|
||||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
|
||||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
|
||||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
|
||||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
|
||||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
|
||||||
|
|
||||||
; and shift the second one (from lanes 4-7) over by 4 before ORing it
|
|
||||||
; with the value of the first one (from lanes 0-3)
|
|
||||||
%v1s = shl i32 %v1, 4
|
|
||||||
%v = or i32 %v0, %v1s
|
|
||||||
ret i32 %v
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Per-lane sum of two 4-wide float vectors; passed as the 4-wide combiner
;; to the reduce8by4 macro by @__reduce_add_float.
define <4 x float> @__vec4_add_float(<4 x float> %v0,
                                     <4 x float> %v1) nounwind readnone alwaysinline {
  %sum = fadd <4 x float> %v0, %v1
  ret <4 x float> %sum
}
|
|
||||||
|
|
||||||
;; Sum of two float scalars; passed as the scalar combiner to the
;; reduce8by4 macro by @__reduce_add_float.
define float @__add_float(float, float) nounwind readnone alwaysinline {
  %sum = fadd float %0, %1
  ret float %sum
}
|
|
||||||
|
|
||||||
define float @__reduce_add_float(<8 x float>) nounwind readnone alwaysinline {
|
|
||||||
reduce8by4(float, @__vec4_add_float, @__add_float)
|
|
||||||
}
|
|
||||||
|
|
||||||
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
|
||||||
reduce8(float, @__min_varying_float, @__min_uniform_float)
|
|
||||||
}
|
|
||||||
|
|
||||||
define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
|
||||||
reduce8(float, @__max_varying_float, @__max_uniform_float)
|
|
||||||
}
|
|
||||||
|
|
||||||
; helper function for reduce_add_int32
|
|
||||||
;; Per-lane sum of two 4-wide i32 vectors; passed as the 4-wide combiner
;; to the reduce8by4 macro by @__reduce_add_int32.
define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
                                   <4 x i32> %v1) nounwind readnone alwaysinline {
  %sum = add <4 x i32> %v0, %v1
  ret <4 x i32> %sum
}
|
|
||||||
|
|
||||||
; helper function for reduce_add_int32
|
|
||||||
;; Sum of two i32 scalars; passed as the scalar combiner to the reduce8by4
;; macro by @__reduce_add_int32.
define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
  %sum = add i32 %0, %1
  ret i32 %sum
}
|
|
||||||
|
|
||||||
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
|
||||||
reduce8by4(i32, @__vec4_add_int32, @__add_int32)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
|
||||||
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
|
||||||
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Sum of all 8 lanes of %v for unsigned values; forwards directly to
;; @__reduce_add_int32.
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
  %total = call i32 @__reduce_add_int32(<8 x i32> %v)
  ret i32 %total
}
|
|
||||||
|
|
||||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
|
||||||
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
|
||||||
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Per-lane sum of two 4-wide double vectors; passed as the 4-wide combiner
;; to the reduce8by4 macro by @__reduce_add_double.
define <4 x double> @__add_varying_double(<4 x double>,
                                          <4 x double>) nounwind readnone alwaysinline {
  %sum = fadd <4 x double> %0, %1
  ret <4 x double> %sum
}
|
|
||||||
|
|
||||||
;; Sum of two double scalars; passed as the scalar combiner to the
;; reduce8by4 macro by @__reduce_add_double.
define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
  %sum = fadd double %0, %1
  ret double %sum
}
|
|
||||||
|
|
||||||
define double @__reduce_add_double(<8 x double>) nounwind readnone {
|
|
||||||
reduce8by4(double, @__add_varying_double, @__add_uniform_double)
|
|
||||||
}
|
|
||||||
|
|
||||||
define double @__reduce_min_double(<8 x double>) nounwind readnone {
|
|
||||||
reduce8(double, @__min_varying_double, @__min_uniform_double)
|
|
||||||
}
|
|
||||||
|
|
||||||
define double @__reduce_max_double(<8 x double>) nounwind readnone {
|
|
||||||
reduce8(double, @__max_varying_double, @__max_uniform_double)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Per-lane sum of two 4-wide i64 vectors; passed as the 4-wide combiner
;; to the reduce8by4 macro by @__reduce_add_int64.
define <4 x i64> @__add_varying_int64(<4 x i64>,
                                      <4 x i64>) nounwind readnone alwaysinline {
  %sum = add <4 x i64> %0, %1
  ret <4 x i64> %sum
}
|
|
||||||
|
|
||||||
;; Sum of two i64 scalars; passed as the scalar combiner to the reduce8by4
;; macro by @__reduce_add_int64.
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %sum = add i64 %0, %1
  ret i64 %sum
}
|
|
||||||
|
|
||||||
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
|
|
||||||
reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
|
|
||||||
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
|
|
||||||
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
|
|
||||||
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
|
|
||||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
|
||||||
}
|
|
||||||
|
|
||||||
reduce_equal(8)
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; unaligned loads/loads+broadcasts
|
|
||||||
|
|
||||||
load_and_broadcast(8, i8, 8)
|
|
||||||
load_and_broadcast(8, i16, 16)
|
|
||||||
load_and_broadcast(8, i32, 32)
|
|
||||||
load_and_broadcast(8, i64, 64)
|
|
||||||
|
|
||||||
load_masked(8, i8, 8, 1)
|
|
||||||
load_masked(8, i16, 16, 2)
|
|
||||||
load_masked(8, i32, 32, 4)
|
|
||||||
load_masked(8, i64, 64, 8)
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; gather/scatter
|
|
||||||
|
|
||||||
gen_gather(8, i8)
|
|
||||||
gen_gather(8, i16)
|
|
||||||
gen_gather(8, i32)
|
|
||||||
gen_gather(8, i64)
|
|
||||||
|
|
||||||
gen_scatter(8, i8)
|
|
||||||
gen_scatter(8, i16)
|
|
||||||
gen_scatter(8, i32)
|
|
||||||
gen_scatter(8, i64)
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; float rounding
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; rounding
|
|
||||||
;;
|
|
||||||
;; There are not any rounding instructions in SSE2, so we have to emulate
|
|
||||||
;; the functionality with multiple instructions...
|
|
||||||
|
|
||||||
; The code for __round_* is the result of compiling the following source
|
|
||||||
; code.
|
|
||||||
;
|
|
||||||
; export float Round(float x) {
|
|
||||||
; unsigned int sign = signbits(x);
|
|
||||||
; unsigned int ix = intbits(x);
|
|
||||||
; ix ^= sign;
|
|
||||||
; x = floatbits(ix);
|
|
||||||
; x += 0x1.0p23f;
|
|
||||||
; x -= 0x1.0p23f;
|
|
||||||
; ix = intbits(x);
|
|
||||||
; ix ^= sign;
|
|
||||||
; x = floatbits(ix);
|
|
||||||
; return x;
|
|
||||||
;}
|
|
||||||
|
|
||||||
;; Rounds each of the 8 float lanes to an integral value without using a
;; rounding instruction:
;;   1. isolate the sign bit (and with 0x80000000) and clear it with xor,
;;   2. add 2^23 (8388608.0) and subtract it again, which drops the fraction,
;;   3. xor the saved sign bit back in.
;; NOTE(review): for magnitudes of 2^23 and above the add/subtract pair can
;; itself round (8388609 + 8388608 is not representable as a float), so such
;; inputs may not come back unchanged -- confirm whether callers care.
;; NOTE(review): the body touches no memory, so readnone looks applicable
;; in place of readonly -- confirm before changing.
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
|
||||||
%float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32>
|
|
||||||
%bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
|
|
||||||
%bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
|
|
||||||
%int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float>
|
|
||||||
%binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
|
|
||||||
%binop21.i = fadd <8 x float> %binop.i, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
|
|
||||||
%float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32>
|
|
||||||
%bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
|
|
||||||
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float>
|
|
||||||
ret <8 x float> %int_to_float_bitcast.i.i.i
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
|
||||||
;; bitcode from compiling the following source code...
|
|
||||||
|
|
||||||
;export float Floor(float x) {
|
|
||||||
; float y = Round(x);
|
|
||||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
|
||||||
; float delta = -1.f;
|
|
||||||
; unsigned int idelta = intbits(delta);
|
|
||||||
; idelta &= cmp;
|
|
||||||
; delta = floatbits(idelta);
|
|
||||||
; return y + delta;
|
|
||||||
;}
|
|
||||||
|
|
||||||
;; Per-lane floor built on @__round_varying_float: round first, then add
;; -1.0 to every lane whose rounded value ended up greater than the input.
;; The constant -1082130432 is the bit pattern 0xBF800000, i.e. -1.0f; it is
;; and-ed with the sign-extended compare result so unaffected lanes add 0.0.
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
|
||||||
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
|
|
||||||
%bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0
|
|
||||||
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
|
|
||||||
%bitop.i = and <8 x i32> %val_to_boolvec32.i, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
|
|
||||||
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
|
|
||||||
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
|
||||||
ret <8 x float> %binop.i
|
|
||||||
}
|
|
||||||
|
|
||||||
;; And here is the code we compiled to get the __ceil* functions below
|
|
||||||
;
|
|
||||||
;export uniform float Ceil(uniform float x) {
|
|
||||||
; uniform float y = Round(x);
|
|
||||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
|
||||||
; uniform float delta = 1.f;
|
|
||||||
; uniform int idelta = intbits(delta);
|
|
||||||
; idelta &= yltx;
|
|
||||||
; delta = floatbits(idelta);
|
|
||||||
; return y + delta;
|
|
||||||
;}
|
|
||||||
|
|
||||||
;; Per-lane ceil built on @__round_varying_float: round first, then add
;; +1.0 to every lane whose rounded value ended up less than the input.
;; The constant 1065353216 is the bit pattern 0x3F800000, i.e. 1.0f; it is
;; and-ed with the sign-extended compare result so unaffected lanes add 0.0.
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
|
||||||
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
|
|
||||||
%bincmp.i = fcmp olt <8 x float> %calltmp.i, %0
|
|
||||||
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
|
|
||||||
%bitop.i = and <8 x i32> %val_to_boolvec32.i, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
|
|
||||||
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
|
|
||||||
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
|
||||||
ret <8 x float> %binop.i
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; rounding doubles
|
|
||||||
|
|
||||||
;; Double-precision rounding for 8 lanes.  The body is a macro expansion that
;; is handed the scalar @round routine; presumably it calls it once per lane
;; (the macro is defined outside this chunk -- confirm).
define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
|
||||||
unary1to8(double, @round)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Double-precision floor for 8 lanes.  The body is a macro expansion that
;; is handed the scalar @floor routine; presumably it calls it once per lane
;; (the macro is defined outside this chunk -- confirm).
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
|
||||||
unary1to8(double, @floor)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Double-precision ceil for 8 lanes.  The body is a macro expansion that
;; is handed the scalar @ceil routine; presumably it calls it once per lane
;; (the macro is defined outside this chunk -- confirm).
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
|
||||||
unary1to8(double, @ceil)
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; masked store
|
|
||||||
|
|
||||||
;; Masked-store support for the 8-wide target: one macro expansion per
;; element type (i8, i16, i32, i64), with the element bit width passed as the
;; last argument.  The macro body is defined outside this chunk.
gen_masked_store(8, i8, 8)
|
|
||||||
gen_masked_store(8, i16, 16)
|
|
||||||
gen_masked_store(8, i32, 32)
|
|
||||||
gen_masked_store(8, i64, 64)
|
|
||||||
|
|
||||||
;; Macro expansion with no arguments; presumably supplies the 8-bit and
;; 16-bit blend variants of the masked store for 8 lanes, going by its name
;; (the macro body is not visible in this chunk -- confirm).
masked_store_blend_8_16_by_8()
|
|
||||||
|
|
||||||
;; Masked store of 8 x i32 done as read-modify-write: load the current
;; contents at %0, merge them with the new values in %1 through
;; @__vselect_i32 under %mask, and store the merged vector back.
;; Presumably lanes with the mask set take the new value and the others keep
;; the old one (the 8-wide select helper is not visible here -- confirm).
;; Both memory accesses use 4-byte alignment, not full vector alignment.
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
|
||||||
<8 x i32> %mask) nounwind alwaysinline {
|
|
||||||
%val = load <8 x i32> * %0, align 4
|
|
||||||
%newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask)
|
|
||||||
store <8 x i32> %newval, <8 x i32> * %0, align 4
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Masked store of 8 x i64 done as read-modify-write.  The old and new
;; vectors are each split into two <4 x i64> halves, every half is viewed as
;; <8 x float> and merged with @__vselect_float, and the two merged halves
;; are concatenated and stored back to %ptr with 8-byte alignment.
;; Each 32-bit mask lane is duplicated so that it covers both 32-bit halves
;; of the matching 64-bit element.
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
|
||||||
<8 x i32> %mask) nounwind alwaysinline {
|
|
||||||
%oldValue = load <8 x i64>* %ptr, align 8
|
|
||||||
|
|
||||||
; Do 8x64-bit blends by doing two <8 x float> blends, where the <8 x float>
|
|
||||||
; values are actually bitcast <4 x i64> values
|
|
||||||
;
|
|
||||||
; set up the first four 64-bit values
|
|
||||||
%old0123 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
|
|
||||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
|
||||||
%old0123f = bitcast <4 x i64> %old0123 to <8 x float>
|
|
||||||
%new0123 = shufflevector <8 x i64> %new, <8 x i64> undef,
|
|
||||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
|
||||||
%new0123f = bitcast <4 x i64> %new0123 to <8 x float>
|
|
||||||
; compute mask--note that the indices are doubled-up
|
|
||||||
%mask0123 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
|
||||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
|
||||||
; and blend the first 4 values
|
|
||||||
%result0123f = call <8 x float> @__vselect_float(<8 x float> %old0123f, <8 x float> %new0123f,
|
|
||||||
<8 x i32> %mask0123)
|
|
||||||
%result0123 = bitcast <8 x float> %result0123f to <4 x i64>
|
|
||||||
|
|
||||||
; and again
|
|
||||||
%old4567 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
|
|
||||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
|
||||||
%old4567f = bitcast <4 x i64> %old4567 to <8 x float>
|
|
||||||
%new4567 = shufflevector <8 x i64> %new, <8 x i64> undef,
|
|
||||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
|
||||||
%new4567f = bitcast <4 x i64> %new4567 to <8 x float>
|
|
||||||
; compute mask--note that the values are doubled-up
|
|
||||||
%mask4567 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
|
||||||
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
|
|
||||||
; and blend the last four values
|
|
||||||
%result4567f = call <8 x float> @__vselect_float(<8 x float> %old4567f, <8 x float> %new4567f,
|
|
||||||
<8 x i32> %mask4567)
|
|
||||||
%result4567 = bitcast <8 x float> %result4567f to <4 x i64>
|
|
||||||
|
|
||||||
; reconstruct the final <8 x i64> vector
|
|
||||||
%final = shufflevector <4 x i64> %result0123, <4 x i64> %result4567,
|
|
||||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
|
||||||
store <8 x i64> %final, <8 x i64> * %ptr, align 8
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; double precision sqrt
|
|
||||||
|
|
||||||
;; 2-wide SSE2 packed double square root intrinsic used by the wrapper below.
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
|
||||||
|
|
||||||
;; Square root of 8 double lanes.  A macro expansion applies the 2-wide
;; intrinsic to the argument %0 and leaves the 8-wide result in %ret, which
;; is then returned.  Presumably the macro issues four 2-wide calls (its
;; body is defined outside this chunk -- confirm).
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
|
||||||
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
|
||||||
ret <8 x double> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; double precision float min/max
|
|
||||||
|
|
||||||
;; 2-wide SSE2 packed double max/min intrinsics used by the wrappers below.
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
|
||||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
|
||||||
|
|
||||||
;; Per-lane minimum of two <8 x double> vectors.  A macro expansion applies
;; the 2-wide min intrinsic to %0 and %1 and leaves the 8-wide result in
;; %ret.  The macro body is defined outside this chunk.
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
|
||||||
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
|
||||||
ret <8 x double> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Per-lane maximum of two <8 x double> vectors.  A macro expansion applies
;; the 2-wide max intrinsic to %0 and %1 and leaves the 8-wide result in
;; %ret.  The macro body is defined outside this chunk.
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
|
||||||
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
|
||||||
ret <8 x double> %ret
|
|
||||||
}
|
|
||||||
380
builtins-sse2.ll
380
builtins-sse2.ll
@@ -36,9 +36,9 @@
|
|||||||
stdlib_core(4)
|
stdlib_core(4)
|
||||||
packed_load_and_store(4)
|
packed_load_and_store(4)
|
||||||
scans(4)
|
scans(4)
|
||||||
int64minmax(4)
|
|
||||||
|
|
||||||
include(`builtins-sse2-common.ll')
|
; Include the various definitions of things that only require SSE1 and SSE2
|
||||||
|
include(`builtins-sse.ll')
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rounding
|
;; rounding
|
||||||
@@ -62,7 +62,7 @@ include(`builtins-sse2-common.ll')
|
|||||||
; return x;
|
; return x;
|
||||||
;}
|
;}
|
||||||
|
|
||||||
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||||
%float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32>
|
%float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32>
|
||||||
%bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
|
%bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
|
||||||
%bitop.i = xor <4 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
|
%bitop.i = xor <4 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
|
||||||
@@ -75,6 +75,19 @@ define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysi
|
|||||||
ret <4 x float> %int_to_float_bitcast.i.i.i
|
ret <4 x float> %int_to_float_bitcast.i.i.i
|
||||||
}
|
}
|
||||||
|
|
||||||
|
;; Scalar counterpart of the vector float rounding routine: isolate the sign
;; bit (and with 0x80000000), clear it with xor, add and then subtract 2^23
;; (8388608.0) to drop the fraction, and xor the sign bit back in.
;; NOTE(review): for magnitudes of 2^23 and above the add/subtract pair can
;; itself round, so such inputs may not come back unchanged -- confirm.
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
%float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
|
||||||
|
%bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
|
||||||
|
%bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
|
||||||
|
%int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
|
||||||
|
%binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
|
||||||
|
%binop21.i = fadd float %binop.i, -8.388608e+06
|
||||||
|
%float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
|
||||||
|
%bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
|
||||||
|
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
|
||||||
|
ret float %int_to_float_bitcast.i.i.i
|
||||||
|
}
|
||||||
|
|
||||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||||
;; bitcode from compiling the following source code...
|
;; bitcode from compiling the following source code...
|
||||||
|
|
||||||
@@ -88,7 +101,7 @@ define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysi
|
|||||||
; return y + delta;
|
; return y + delta;
|
||||||
;}
|
;}
|
||||||
|
|
||||||
define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||||
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
|
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
|
||||||
%bincmp.i = fcmp ogt <4 x float> %calltmp.i, %0
|
%bincmp.i = fcmp ogt <4 x float> %calltmp.i, %0
|
||||||
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
|
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
|
||||||
@@ -98,6 +111,16 @@ define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysi
|
|||||||
ret <4 x float> %binop.i
|
ret <4 x float> %binop.i
|
||||||
}
|
}
|
||||||
|
|
||||||
|
;; Scalar floor built on @__round_uniform_float: round first, then add -1.0
;; when the rounded value is greater than the input.  -1082130432 is the bit
;; pattern 0xBF800000 (-1.0f); it is and-ed with the sign-extended compare
;; result, so 0.0 is added when no correction is needed.
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
|
||||||
|
%bincmp.i = fcmp ogt float %calltmp.i, %0
|
||||||
|
%selectexpr.i = sext i1 %bincmp.i to i32
|
||||||
|
%bitop.i = and i32 %selectexpr.i, -1082130432
|
||||||
|
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
|
||||||
|
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||||
|
ret float %binop.i
|
||||||
|
}
|
||||||
|
|
||||||
;; And here is the code we compiled to get the __ceil* functions below
|
;; And here is the code we compiled to get the __ceil* functions below
|
||||||
;
|
;
|
||||||
;export uniform float Ceil(uniform float x) {
|
;export uniform float Ceil(uniform float x) {
|
||||||
@@ -110,7 +133,7 @@ define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysi
|
|||||||
; return y + delta;
|
; return y + delta;
|
||||||
;}
|
;}
|
||||||
|
|
||||||
define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||||
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
|
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
|
||||||
%bincmp.i = fcmp olt <4 x float> %calltmp.i, %0
|
%bincmp.i = fcmp olt <4 x float> %calltmp.i, %0
|
||||||
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
|
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
|
||||||
@@ -120,21 +143,50 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin
|
|||||||
ret <4 x float> %binop.i
|
ret <4 x float> %binop.i
|
||||||
}
|
}
|
||||||
|
|
||||||
|
;; Scalar ceil built on @__round_uniform_float: round first, then add +1.0
;; when the rounded value is less than the input.  1065353216 is the bit
;; pattern 0x3F800000 (1.0f); it is and-ed with the sign-extended compare
;; result, so 0.0 is added when no correction is needed.
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
|
||||||
|
%bincmp.i = fcmp olt float %calltmp.i, %0
|
||||||
|
%selectexpr.i = sext i1 %bincmp.i to i32
|
||||||
|
%bitop.i = and i32 %selectexpr.i, 1065353216
|
||||||
|
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
|
||||||
|
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||||
|
ret float %binop.i
|
||||||
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rounding doubles
|
;; rounding doubles
|
||||||
|
|
||||||
define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
declare double @round(double)
|
||||||
|
declare double @floor(double)
|
||||||
|
declare double @ceil(double)
|
||||||
|
|
||||||
|
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||||
unary1to4(double, @round)
|
unary1to4(double, @round)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
|
%r = call double @round(double %0)
|
||||||
|
ret double %r
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||||
unary1to4(double, @floor)
|
unary1to4(double, @floor)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
|
%r = call double @floor(double %0)
|
||||||
|
ret double %r
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||||
unary1to4(double, @ceil)
|
unary1to4(double, @ceil)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
|
%r = call double @ceil(double %0)
|
||||||
|
ret double %r
|
||||||
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; min/max
|
;; min/max
|
||||||
|
|
||||||
@@ -143,7 +195,7 @@ define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alway
|
|||||||
; vector element, if the mask is on, we return the corresponding value
|
; vector element, if the mask is on, we return the corresponding value
|
||||||
; from %1, and otherwise return the value from %0.
|
; from %1, and otherwise return the value from %0.
|
||||||
|
|
||||||
define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
|
define internal <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
|
||||||
<4 x i32> %mask) nounwind readnone alwaysinline {
|
<4 x i32> %mask) nounwind readnone alwaysinline {
|
||||||
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
|
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
|
||||||
%cleared_old = and <4 x i32> %0, %notmask
|
%cleared_old = and <4 x i32> %0, %notmask
|
||||||
@@ -152,7 +204,7 @@ define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
|
|||||||
ret <4 x i32> %new
|
ret <4 x i32> %new
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @__vselect_float(<4 x float>, <4 x float>,
|
define internal <4 x float> @__vselect_float(<4 x float>, <4 x float>,
|
||||||
<4 x i32> %mask) nounwind readnone alwaysinline {
|
<4 x i32> %mask) nounwind readnone alwaysinline {
|
||||||
%v0 = bitcast <4 x float> %0 to <4 x i32>
|
%v0 = bitcast <4 x float> %0 to <4 x i32>
|
||||||
%v1 = bitcast <4 x float> %1 to <4 x i32>
|
%v1 = bitcast <4 x float> %1 to <4 x i32>
|
||||||
@@ -166,27 +218,27 @@ define <4 x float> @__vselect_float(<4 x float>, <4 x float>,
|
|||||||
; extend the i1 vector result to an i32 mask. The __vselect does the
|
; extend the i1 vector result to an i32 mask. The __vselect does the
|
||||||
; rest...
|
; rest...
|
||||||
|
|
||||||
define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||||
%c = icmp slt <4 x i32> %0, %1
|
%c = icmp slt <4 x i32> %0, %1
|
||||||
%mask = sext <4 x i1> %c to <4 x i32>
|
%mask = sext <4 x i1> %c to <4 x i32>
|
||||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||||
ret <4 x i32> %v
|
ret <4 x i32> %v
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||||
%c = icmp slt i32 %0, %1
|
%c = icmp slt i32 %0, %1
|
||||||
%r = select i1 %c, i32 %0, i32 %1
|
%r = select i1 %c, i32 %0, i32 %1
|
||||||
ret i32 %r
|
ret i32 %r
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||||
%c = icmp sgt <4 x i32> %0, %1
|
%c = icmp sgt <4 x i32> %0, %1
|
||||||
%mask = sext <4 x i1> %c to <4 x i32>
|
%mask = sext <4 x i1> %c to <4 x i32>
|
||||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||||
ret <4 x i32> %v
|
ret <4 x i32> %v
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||||
%c = icmp sgt i32 %0, %1
|
%c = icmp sgt i32 %0, %1
|
||||||
%r = select i1 %c, i32 %0, i32 %1
|
%r = select i1 %c, i32 %0, i32 %1
|
||||||
ret i32 %r
|
ret i32 %r
|
||||||
@@ -195,27 +247,27 @@ define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
|||||||
; The functions for unsigned ints are similar, just with unsigned
|
; The functions for unsigned ints are similar, just with unsigned
|
||||||
; comparison functions...
|
; comparison functions...
|
||||||
|
|
||||||
define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||||
%c = icmp ult <4 x i32> %0, %1
|
%c = icmp ult <4 x i32> %0, %1
|
||||||
%mask = sext <4 x i1> %c to <4 x i32>
|
%mask = sext <4 x i1> %c to <4 x i32>
|
||||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||||
ret <4 x i32> %v
|
ret <4 x i32> %v
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||||
%c = icmp ult i32 %0, %1
|
%c = icmp ult i32 %0, %1
|
||||||
%r = select i1 %c, i32 %0, i32 %1
|
%r = select i1 %c, i32 %0, i32 %1
|
||||||
ret i32 %r
|
ret i32 %r
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||||
%c = icmp ugt <4 x i32> %0, %1
|
%c = icmp ugt <4 x i32> %0, %1
|
||||||
%mask = sext <4 x i1> %c to <4 x i32>
|
%mask = sext <4 x i1> %c to <4 x i32>
|
||||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||||
ret <4 x i32> %v
|
ret <4 x i32> %v
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||||
%c = icmp ugt i32 %0, %1
|
%c = icmp ugt i32 %0, %1
|
||||||
%r = select i1 %c, i32 %0, i32 %1
|
%r = select i1 %c, i32 %0, i32 %1
|
||||||
ret i32 %r
|
ret i32 %r
|
||||||
@@ -225,15 +277,21 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
; horizontal ops / reductions
|
; horizontal ops / reductions
|
||||||
|
|
||||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
declare i32 @llvm.ctpop.i32(i32)
|
||||||
|
declare i64 @llvm.ctpop.i64(i64)
|
||||||
|
|
||||||
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
%val = call i32 @llvm.ctpop.i32(i32 %0)
|
||||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
ret i32 %val
|
||||||
ret i32 %v
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
|
define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
|
||||||
|
%val = call i64 @llvm.ctpop.i64(i64 %0)
|
||||||
|
ret i64 %val
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||||
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
|
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
|
||||||
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||||
%m1 = fadd <4 x float> %v1, %v
|
%m1 = fadd <4 x float> %v1, %v
|
||||||
@@ -243,96 +301,6 @@ define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline
|
|||||||
ret float %sum
|
ret float %sum
|
||||||
}
|
}
|
||||||
|
|
||||||
;; Horizontal minimum of a <4 x float> vector, returned as one float.
;; The body comes from the 4-wide reduction macro, which is given the float
;; varying and uniform min helpers; the macro is defined outside this chunk.
define float @__reduce_min_float(<4 x float>) nounwind readnone {
|
|
||||||
reduce4(float, @__min_varying_float, @__min_uniform_float)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Horizontal maximum of a <4 x float> vector, returned as one float.
;; The body comes from the 4-wide reduction macro, which is given the float
;; varying and uniform max helpers; the macro is defined outside this chunk.
define float @__reduce_max_float(<4 x float>) nounwind readnone {
|
|
||||||
reduce4(float, @__max_varying_float, @__max_uniform_float)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Sum of the four i32 lanes of %v.  Lanes 2 and 3 are shuffled down onto
;; lanes 0 and 1 and added to %v, so lane 0 holds v0+v2 and lane 1 holds
;; v1+v3; those two partial sums are extracted and added as scalars.
;; The upper two lanes of the shuffled vector are undef and never read.
define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
|
|
||||||
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
|
|
||||||
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
|
||||||
%m1 = add <4 x i32> %v1, %v
|
|
||||||
%m1a = extractelement <4 x i32> %m1, i32 0
|
|
||||||
%m1b = extractelement <4 x i32> %m1, i32 1
|
|
||||||
%sum = add i32 %m1a, %m1b
|
|
||||||
ret i32 %sum
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Horizontal minimum of a <4 x i32> vector using the int32 min helpers.
;; The body comes from the 4-wide reduction macro defined outside this chunk.
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
|
|
||||||
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Horizontal maximum of a <4 x i32> vector using the int32 max helpers.
;; The body comes from the 4-wide reduction macro defined outside this chunk.
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
|
||||||
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Unsigned 32-bit lane sum.  Simply forwards to @__reduce_add_int32, since
;; wrapping integer addition produces the same bits for both signednesses.
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
|
|
||||||
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
|
|
||||||
ret i32 %r
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Horizontal minimum of a <4 x i32> vector using the uint32 min helpers.
;; The body comes from the 4-wide reduction macro defined outside this chunk.
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
|
||||||
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Horizontal maximum of a <4 x i32> vector using the uint32 max helpers.
;; The body comes from the 4-wide reduction macro defined outside this chunk.
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
|
||||||
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;; Sum of the four double lanes of %0.  The vector is split into a low pair
;; (lanes 0,1) and a high pair (lanes 2,3), the pairs are added as
;; <2 x double>, and the two resulting lanes are added as scalars.
;; The summation order is therefore (v0+v2) + (v1+v3).
define double @__reduce_add_double(<4 x double>) nounwind readnone {
|
|
||||||
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
|
|
||||||
<2 x i32> <i32 0, i32 1>
|
|
||||||
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
|
|
||||||
<2 x i32> <i32 2, i32 3>
|
|
||||||
%sum = fadd <2 x double> %v0, %v1
|
|
||||||
%e0 = extractelement <2 x double> %sum, i32 0
|
|
||||||
%e1 = extractelement <2 x double> %sum, i32 1
|
|
||||||
%m = fadd double %e0, %e1
|
|
||||||
ret double %m
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Horizontal minimum of a <4 x double> vector using the double min helpers.
;; The body comes from the 4-wide reduction macro defined outside this chunk.
define double @__reduce_min_double(<4 x double>) nounwind readnone {
|
|
||||||
reduce4(double, @__min_varying_double, @__min_uniform_double)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Horizontal maximum of a <4 x double> vector using the double max helpers.
;; The body comes from the 4-wide reduction macro defined outside this chunk.
define double @__reduce_max_double(<4 x double>) nounwind readnone {
|
|
||||||
reduce4(double, @__max_varying_double, @__max_uniform_double)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Sum of the four i64 lanes of %0.  The vector is split into a low pair
;; (lanes 0,1) and a high pair (lanes 2,3), the pairs are added as
;; <2 x i64>, and the two resulting lanes are added as scalars.
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
|
|
||||||
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
|
||||||
<2 x i32> <i32 0, i32 1>
|
|
||||||
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
|
||||||
<2 x i32> <i32 2, i32 3>
|
|
||||||
%sum = add <2 x i64> %v0, %v1
|
|
||||||
%e0 = extractelement <2 x i64> %sum, i32 0
|
|
||||||
%e1 = extractelement <2 x i64> %sum, i32 1
|
|
||||||
%m = add i64 %e0, %e1
|
|
||||||
ret i64 %m
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Horizontal minimum of a <4 x i64> vector using the int64 min helpers.
;; The body comes from the 4-wide reduction macro defined outside this chunk.
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
|
|
||||||
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Horizontal maximum of a <4 x i64> vector using the int64 max helpers.
;; The body comes from the 4-wide reduction macro defined outside this chunk.
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
|
|
||||||
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Horizontal minimum of a <4 x i64> vector using the uint64 min helpers.
;; The body comes from the 4-wide reduction macro defined outside this chunk.
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
|
|
||||||
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Horizontal maximum of a <4 x i64> vector using the uint64 max helpers.
;; The body comes from the 4-wide reduction macro defined outside this chunk.
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
|
|
||||||
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
|
||||||
}
|
|
||||||
|
|
||||||
;; Macro expansion for the 4-wide target; presumably emits the reduce-equal
;; entry points (the macro body is not visible in this chunk -- confirm).
reduce_equal(4)
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; masked store
|
;; masked store
|
||||||
@@ -387,187 +355,3 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; rcp
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
|
||||||
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
|
|
||||||
; do one N-R iteration to improve precision
|
|
||||||
; float iv = __rcp_v(v);
|
|
||||||
; return iv * (2. - v * iv);
|
|
||||||
%v_iv = fmul <4 x float> %0, %call
|
|
||||||
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
|
|
||||||
%iv_mul = fmul <4 x float> %call, %two_minus
|
|
||||||
ret <4 x float> %iv_mul
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; rsqrt
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
|
|
||||||
; float is = __rsqrt_v(v);
|
|
||||||
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
|
|
||||||
; Newton-Raphson iteration to improve precision
|
|
||||||
; return 0.5 * is * (3. - (v * is) * is);
|
|
||||||
%v_is = fmul <4 x float> %v, %is
|
|
||||||
%v_is_is = fmul <4 x float> %v_is, %is
|
|
||||||
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
|
|
||||||
%is_mul = fmul <4 x float> %is, %three_sub
|
|
||||||
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
|
||||||
ret <4 x float> %half_scale
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; sqrt
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
|
||||||
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
|
|
||||||
ret <4 x float> %call
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; svml stuff
|
|
||||||
|
|
||||||
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
|
|
||||||
define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
|
|
||||||
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
|
|
||||||
store <4 x float> %s, <4 x float> * %1
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; float min/max
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
|
||||||
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
|
|
||||||
ret <4 x float> %call
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
|
||||||
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
|
|
||||||
ret <4 x float> %call
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; double precision sqrt
|
|
||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
|
||||||
|
|
||||||
define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
|
|
||||||
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
|
||||||
ret <4 x double> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; double precision min/max
|
|
||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
|
||||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
|
||||||
|
|
||||||
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
|
||||||
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
|
||||||
ret <4 x double> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
|
||||||
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
|
||||||
ret <4 x double> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; masked store
|
|
||||||
|
|
||||||
masked_store_blend_8_16_by_4()
|
|
||||||
|
|
||||||
gen_masked_store(4, i8, 8)
|
|
||||||
gen_masked_store(4, i16, 16)
|
|
||||||
gen_masked_store(4, i32, 32)
|
|
||||||
gen_masked_store(4, i64, 64)
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; unaligned loads/loads+broadcasts
|
|
||||||
|
|
||||||
load_and_broadcast(4, i8, 8)
|
|
||||||
load_and_broadcast(4, i16, 16)
|
|
||||||
load_and_broadcast(4, i32, 32)
|
|
||||||
load_and_broadcast(4, i64, 64)
|
|
||||||
|
|
||||||
load_masked(4, i8, 8, 1)
|
|
||||||
load_masked(4, i16, 16, 2)
|
|
||||||
load_masked(4, i32, 32, 4)
|
|
||||||
load_masked(4, i64, 64, 8)
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; gather/scatter
|
|
||||||
|
|
||||||
; define these with the macros from stdlib.m4
|
|
||||||
|
|
||||||
gen_gather(4, i8)
|
|
||||||
gen_gather(4, i16)
|
|
||||||
gen_gather(4, i32)
|
|
||||||
gen_gather(4, i64)
|
|
||||||
|
|
||||||
gen_scatter(4, i8)
|
|
||||||
gen_scatter(4, i16)
|
|
||||||
gen_scatter(4, i32)
|
|
||||||
gen_scatter(4, i64)
|
|
||||||
|
|||||||
@@ -1,271 +0,0 @@
|
|||||||
;; Copyright (c) 2010-2011, Intel Corporation
|
|
||||||
;; All rights reserved.
|
|
||||||
;;
|
|
||||||
;; Redistribution and use in source and binary forms, with or without
|
|
||||||
;; modification, are permitted provided that the following conditions are
|
|
||||||
;; met:
|
|
||||||
;;
|
|
||||||
;; * Redistributions of source code must retain the above copyright
|
|
||||||
;; notice, this list of conditions and the following disclaimer.
|
|
||||||
;;
|
|
||||||
;; * Redistributions in binary form must reproduce the above copyright
|
|
||||||
;; notice, this list of conditions and the following disclaimer in the
|
|
||||||
;; documentation and/or other materials provided with the distribution.
|
|
||||||
;;
|
|
||||||
;; * Neither the name of Intel Corporation nor the names of its
|
|
||||||
;; contributors may be used to endorse or promote products derived from
|
|
||||||
;; this software without specific prior written permission.
|
|
||||||
;;
|
|
||||||
;;
|
|
||||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
|
||||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
||||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
||||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
||||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
||||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
||||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
||||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
||||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; rounding floats
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
|
||||||
|
|
||||||
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
|
||||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
|
||||||
; the roundss intrinsic is a total mess--docs say:
|
|
||||||
;
|
|
||||||
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
|
||||||
;
|
|
||||||
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
|
||||||
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
|
||||||
; return value is described by the following equations:
|
|
||||||
;
|
|
||||||
; r0 = RND(b0)
|
|
||||||
; r1 = a1
|
|
||||||
; r2 = a2
|
|
||||||
; r3 = a3
|
|
||||||
;
|
|
||||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
|
||||||
; here. So we pass the same register for both. Further, only the 0th
|
|
||||||
; element of the b parameter matters
|
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
|
||||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
|
||||||
%rs = extractelement <4 x float> %xr, i32 0
|
|
||||||
ret float %rs
|
|
||||||
}
|
|
||||||
|
|
||||||
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
|
||||||
; see above for round_ss intrinsic discussion...
|
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
|
||||||
; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
|
||||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
|
||||||
%rs = extractelement <4 x float> %xr, i32 0
|
|
||||||
ret float %rs
|
|
||||||
}
|
|
||||||
|
|
||||||
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
|
||||||
; see above for round_ss intrinsic discussion...
|
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
|
||||||
; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
|
||||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
|
||||||
%rs = extractelement <4 x float> %xr, i32 0
|
|
||||||
ret float %rs
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; rounding doubles
|
|
||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
|
||||||
|
|
||||||
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
|
||||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
|
||||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
|
||||||
%rs = extractelement <2 x double> %xr, i32 0
|
|
||||||
ret double %rs
|
|
||||||
}
|
|
||||||
|
|
||||||
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
|
||||||
; see above for round_ss intrinsic discussion...
|
|
||||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
|
||||||
; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
|
||||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
|
||||||
%rs = extractelement <2 x double> %xr, i32 0
|
|
||||||
ret double %rs
|
|
||||||
}
|
|
||||||
|
|
||||||
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
|
||||||
; see above for round_ss intrinsic discussion...
|
|
||||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
|
||||||
; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
|
||||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
|
||||||
%rs = extractelement <2 x double> %xr, i32 0
|
|
||||||
ret double %rs
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; rcp
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
|
||||||
; do the rcpss call
|
|
||||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
|
||||||
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
|
||||||
%scall = extractelement <4 x float> %call, i32 0
|
|
||||||
|
|
||||||
; do one N-R iteration to improve precision, as above
|
|
||||||
%v_iv = fmul float %0, %scall
|
|
||||||
%two_minus = fsub float 2., %v_iv
|
|
||||||
%iv_mul = fmul float %scall, %two_minus
|
|
||||||
ret float %iv_mul
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; rsqrt
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
|
||||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
|
||||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
|
||||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
|
||||||
%is = extractelement <4 x float> %vis, i32 0
|
|
||||||
|
|
||||||
; Newton-Raphson iteration to improve precision
|
|
||||||
; return 0.5 * is * (3. - (v * is) * is);
|
|
||||||
%v_is = fmul float %0, %is
|
|
||||||
%v_is_is = fmul float %v_is, %is
|
|
||||||
%three_sub = fsub float 3., %v_is_is
|
|
||||||
%is_mul = fmul float %is, %three_sub
|
|
||||||
%half_scale = fmul float 0.5, %is_mul
|
|
||||||
ret float %half_scale
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; sqrt
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
|
||||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
|
||||||
ret float %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; fast math mode
|
|
||||||
|
|
||||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
|
||||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
|
||||||
|
|
||||||
define void @__fastmath() nounwind alwaysinline {
|
|
||||||
%ptr = alloca i32
|
|
||||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
|
||||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
|
||||||
%oldval = load i32 *%ptr
|
|
||||||
|
|
||||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
|
||||||
%update = or i32 %oldval, 32832
|
|
||||||
store i32 %update, i32 *%ptr
|
|
||||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; float min/max
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
|
||||||
ret float %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
|
||||||
ret float %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; double precision sqrt
|
|
||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
|
||||||
|
|
||||||
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
|
||||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
|
|
||||||
ret double %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; double precision min/max
|
|
||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
||||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
||||||
|
|
||||||
define double @__min_uniform_double(double, double) nounwind readnone {
|
|
||||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
|
||||||
ret double %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
define double @__max_uniform_double(double, double) nounwind readnone {
|
|
||||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
|
||||||
ret double %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; int32 min/max
|
|
||||||
|
|
||||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
|
||||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
|
||||||
|
|
||||||
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
|
||||||
ret i32 %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
|
||||||
ret i32 %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; unsigned int min/max
|
|
||||||
|
|
||||||
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
|
||||||
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
|
||||||
|
|
||||||
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
|
||||||
ret i32 %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
|
||||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
|
||||||
ret i32 %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; horizontal ops / reductions
|
|
||||||
|
|
||||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
|
||||||
|
|
||||||
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
|
||||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
|
||||||
ret i32 %call
|
|
||||||
}
|
|
||||||
|
|
||||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
|
||||||
|
|
||||||
define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
|
||||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
|
||||||
ret i64 %call
|
|
||||||
}
|
|
||||||
402
builtins-sse4.ll
402
builtins-sse4.ll
@@ -36,334 +36,200 @@
|
|||||||
stdlib_core(4)
|
stdlib_core(4)
|
||||||
packed_load_and_store(4)
|
packed_load_and_store(4)
|
||||||
scans(4)
|
scans(4)
|
||||||
int64minmax(4)
|
|
||||||
|
|
||||||
include(`builtins-sse4-common.ll')
|
; Define the stuff that can be done with base SSE1/SSE2 instructions
|
||||||
|
include(`builtins-sse.ll')
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; rcp
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
|
||||||
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
|
|
||||||
; do one N-R iteration to improve precision
|
|
||||||
; float iv = __rcp_v(v);
|
|
||||||
; return iv * (2. - v * iv);
|
|
||||||
%v_iv = fmul <4 x float> %0, %call
|
|
||||||
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
|
|
||||||
%iv_mul = fmul <4 x float> %call, %two_minus
|
|
||||||
ret <4 x float> %iv_mul
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; rsqrt
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
|
|
||||||
; float is = __rsqrt_v(v);
|
|
||||||
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
|
|
||||||
; Newton-Raphson iteration to improve precision
|
|
||||||
; return 0.5 * is * (3. - (v * is) * is);
|
|
||||||
%v_is = fmul <4 x float> %v, %is
|
|
||||||
%v_is_is = fmul <4 x float> %v_is, %is
|
|
||||||
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
|
|
||||||
%is_mul = fmul <4 x float> %is, %three_sub
|
|
||||||
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
|
||||||
ret <4 x float> %half_scale
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; sqrt
|
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
|
||||||
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
|
|
||||||
ret <4 x float> %call
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; double precision sqrt
|
|
||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
|
||||||
|
|
||||||
define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
|
|
||||||
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
|
||||||
ret <4 x double> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rounding floats
|
;; rounding floats
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
|
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||||
|
|
||||||
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
|
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
|
||||||
ret <4 x float> %call
|
ret <4 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||||
|
; the roundss intrinsic is a total mess--docs say:
|
||||||
|
;
|
||||||
|
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
||||||
|
;
|
||||||
|
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
||||||
|
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
||||||
|
; return value is described by the following equations:
|
||||||
|
;
|
||||||
|
; r0 = RND(b0)
|
||||||
|
; r1 = a1
|
||||||
|
; r2 = a2
|
||||||
|
; r3 = a3
|
||||||
|
;
|
||||||
|
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||||
|
; here. So we pass the same register for both. Further, only the 0th
|
||||||
|
; element of the b parameter matters
|
||||||
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||||
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
|
ret float %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
|
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
|
||||||
ret <4 x float> %call
|
ret <4 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; see above for round_ss intrinsic discussion...
|
||||||
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||||
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
|
ret float %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
|
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
|
||||||
ret <4 x float> %call
|
ret <4 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; see above for round_ss intrinsic discussion...
|
||||||
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||||
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
|
ret float %rs
|
||||||
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rounding doubles
|
;; rounding doubles
|
||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
|
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
|
||||||
|
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||||
|
|
||||||
define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||||
round2to4double(%0, 8)
|
round2to4double(%0, 8)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||||
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
|
ret double %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
round2to4double(%0, 9)
|
round2to4double(%0, 9)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
|
; see above for round_ss intrinsic discussion...
|
||||||
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
|
; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||||
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
|
ret double %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
round2to4double(%0, 10)
|
round2to4double(%0, 10)
|
||||||
}
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
;; float min/max
|
; see above for round_ss intrinsic discussion...
|
||||||
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
; roundsd, round up (0b10) | don't signal precision exceptions (0b1000) = 0b1010 = 10
|
||||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||||
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
ret double %rs
|
||||||
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
|
|
||||||
ret <4 x float> %call
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
|
||||||
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
|
|
||||||
ret <4 x float> %call
|
|
||||||
}
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; int32 min/max
|
;; int32 min/max
|
||||||
|
|
||||||
define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
|
||||||
|
define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||||
%call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
|
%call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
|
||||||
ret <4 x i32> %call
|
ret <4 x i32> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||||
|
ret i32 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||||
%call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
|
%call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
|
||||||
ret <4 x i32> %call
|
ret <4 x i32> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||||
|
ret i32 %ret
|
||||||
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
; unsigned int min/max
|
; unsigned int min/max
|
||||||
|
|
||||||
define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
|
||||||
|
define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||||
%call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
|
%call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
|
||||||
ret <4 x i32> %call
|
ret <4 x i32> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||||
|
ret i32 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||||
%call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
|
%call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
|
||||||
ret <4 x i32> %call
|
ret <4 x i32> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||||
;; double precision min/max
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||||
|
ret i32 %ret
|
||||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
|
||||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
|
||||||
|
|
||||||
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
|
||||||
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
|
||||||
ret <4 x double> %ret
|
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
|
||||||
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
|
||||||
ret <4 x double> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
; svml stuff
|
|
||||||
|
|
||||||
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
|
|
||||||
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
|
||||||
|
|
||||||
|
|
||||||
define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
|
|
||||||
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
|
|
||||||
store <4 x float> %s, <4 x float> * %1
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
|
||||||
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
|
|
||||||
ret <4 x float> %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
; horizontal ops / reductions
|
; horizontal ops / reductions
|
||||||
|
|
||||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||||
|
|
||||||
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
ret i32 %call
|
||||||
ret i32 %v
|
}
|
||||||
|
|
||||||
|
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||||
|
|
||||||
|
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||||
|
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||||
|
ret i64 %call
|
||||||
}
|
}
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
|
||||||
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
|
define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
|
||||||
%v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
|
%v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
|
||||||
%v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
|
%v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
|
||||||
%scalar = extractelement <4 x float> %v2, i32 0
|
%scalar = extractelement <4 x float> %v2, i32 0
|
||||||
ret float %scalar
|
ret float %scalar
|
||||||
}
|
}
|
||||||
|
|
||||||
define float @__reduce_min_float(<4 x float>) nounwind readnone {
|
|
||||||
reduce4(float, @__min_varying_float, @__min_uniform_float)
|
|
||||||
}
|
|
||||||
|
|
||||||
define float @__reduce_max_float(<4 x float>) nounwind readnone {
|
|
||||||
reduce4(float, @__max_varying_float, @__max_uniform_float)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
|
|
||||||
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
|
|
||||||
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
|
||||||
%m1 = add <4 x i32> %v1, %v
|
|
||||||
%m1a = extractelement <4 x i32> %m1, i32 0
|
|
||||||
%m1b = extractelement <4 x i32> %m1, i32 1
|
|
||||||
%sum = add i32 %m1a, %m1b
|
|
||||||
ret i32 %sum
|
|
||||||
}
|
|
||||||
|
|
||||||
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
|
|
||||||
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
|
||||||
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
|
|
||||||
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
|
|
||||||
ret i32 %r
|
|
||||||
}
|
|
||||||
|
|
||||||
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
|
||||||
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
|
||||||
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
define double @__reduce_add_double(<4 x double>) nounwind readnone {
|
|
||||||
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
|
|
||||||
<2 x i32> <i32 0, i32 1>
|
|
||||||
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
|
|
||||||
<2 x i32> <i32 2, i32 3>
|
|
||||||
%sum = fadd <2 x double> %v0, %v1
|
|
||||||
%e0 = extractelement <2 x double> %sum, i32 0
|
|
||||||
%e1 = extractelement <2 x double> %sum, i32 1
|
|
||||||
%m = fadd double %e0, %e1
|
|
||||||
ret double %m
|
|
||||||
}
|
|
||||||
|
|
||||||
define double @__reduce_min_double(<4 x double>) nounwind readnone {
|
|
||||||
reduce4(double, @__min_varying_double, @__min_uniform_double)
|
|
||||||
}
|
|
||||||
|
|
||||||
define double @__reduce_max_double(<4 x double>) nounwind readnone {
|
|
||||||
reduce4(double, @__max_varying_double, @__max_uniform_double)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
|
|
||||||
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
|
||||||
<2 x i32> <i32 0, i32 1>
|
|
||||||
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
|
||||||
<2 x i32> <i32 2, i32 3>
|
|
||||||
%sum = add <2 x i64> %v0, %v1
|
|
||||||
%e0 = extractelement <2 x i64> %sum, i32 0
|
|
||||||
%e1 = extractelement <2 x i64> %sum, i32 1
|
|
||||||
%m = add i64 %e0, %e1
|
|
||||||
ret i64 %m
|
|
||||||
}
|
|
||||||
|
|
||||||
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
|
|
||||||
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
|
|
||||||
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
|
|
||||||
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
|
||||||
}
|
|
||||||
|
|
||||||
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
|
|
||||||
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
|
||||||
}
|
|
||||||
|
|
||||||
reduce_equal(4)
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; masked store
|
;; masked store
|
||||||
|
|
||||||
@@ -432,41 +298,3 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
|||||||
store <4 x i64> %final, <4 x i64> * %ptr, align 8
|
store <4 x i64> %final, <4 x i64> * %ptr, align 8
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; masked store
|
|
||||||
|
|
||||||
masked_store_blend_8_16_by_4()
|
|
||||||
|
|
||||||
gen_masked_store(4, i8, 8)
|
|
||||||
gen_masked_store(4, i16, 16)
|
|
||||||
gen_masked_store(4, i32, 32)
|
|
||||||
gen_masked_store(4, i64, 64)
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; unaligned loads/loads+broadcasts
|
|
||||||
|
|
||||||
load_and_broadcast(4, i8, 8)
|
|
||||||
load_and_broadcast(4, i16, 16)
|
|
||||||
load_and_broadcast(4, i32, 32)
|
|
||||||
load_and_broadcast(4, i64, 64)
|
|
||||||
|
|
||||||
load_masked(4, i8, 8, 1)
|
|
||||||
load_masked(4, i16, 16, 2)
|
|
||||||
load_masked(4, i32, 32, 4)
|
|
||||||
load_masked(4, i64, 64, 8)
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; gather/scatter
|
|
||||||
|
|
||||||
; define these with the macros from stdlib.m4
|
|
||||||
|
|
||||||
gen_gather(4, i8)
|
|
||||||
gen_gather(4, i16)
|
|
||||||
gen_gather(4, i32)
|
|
||||||
gen_gather(4, i64)
|
|
||||||
|
|
||||||
gen_scatter(4, i8)
|
|
||||||
gen_scatter(4, i16)
|
|
||||||
gen_scatter(4, i32)
|
|
||||||
gen_scatter(4, i64)
|
|
||||||
|
|||||||
@@ -41,14 +41,13 @@ packed_load_and_store(8)
|
|||||||
scans(8)
|
scans(8)
|
||||||
int64minmax(8)
|
int64minmax(8)
|
||||||
|
|
||||||
include(`builtins-sse4-common.ll')
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rcp
|
;; rcp
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; float iv = __rcp_v(v);
|
; float iv = __rcp_v(v);
|
||||||
; return iv * (2. - v * iv);
|
; return iv * (2. - v * iv);
|
||||||
|
|
||||||
@@ -61,12 +60,27 @@ define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinl
|
|||||||
ret <8 x float> %iv_mul
|
ret <8 x float> %iv_mul
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; uniform float iv = extract(__rcp_u(v), 0);
|
||||||
|
; return iv * (2. - v * iv);
|
||||||
|
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
||||||
|
%scall = extractelement <4 x float> %call, i32 0
|
||||||
|
|
||||||
|
; do one N-R iteration
|
||||||
|
%v_iv = fmul float %0, %scall
|
||||||
|
%two_minus = fsub float 2., %v_iv
|
||||||
|
%iv_mul = fmul float %scall, %two_minus
|
||||||
|
ret float %iv_mul
|
||||||
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rsqrt
|
;; rsqrt
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||||
; float is = __rsqrt_v(v);
|
; float is = __rsqrt_v(v);
|
||||||
unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
|
unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
|
||||||
; return 0.5 * is * (3. - (v * is) * is);
|
; return 0.5 * is * (3. - (v * is) * is);
|
||||||
@@ -80,16 +94,56 @@ define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwa
|
|||||||
ret <8 x float> %half_scale
|
ret <8 x float> %half_scale
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||||
|
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||||
|
%is = extractelement <4 x float> %vis, i32 0
|
||||||
|
|
||||||
|
; return 0.5 * is * (3. - (v * is) * is);
|
||||||
|
%v_is = fmul float %0, %is
|
||||||
|
%v_is_is = fmul float %v_is, %is
|
||||||
|
%three_sub = fsub float 3., %v_is_is
|
||||||
|
%is_mul = fmul float %is, %three_sub
|
||||||
|
%half_scale = fmul float 0.5, %is_mul
|
||||||
|
ret float %half_scale
|
||||||
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; sqrt
|
;; sqrt
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
|
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
|
||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||||
|
ret float %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; fast math
|
||||||
|
|
||||||
|
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||||
|
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||||
|
|
||||||
|
define internal void @__fastmath() nounwind alwaysinline {
|
||||||
|
%ptr = alloca i32
|
||||||
|
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||||
|
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||||
|
%oldval = load i32 *%ptr
|
||||||
|
|
||||||
|
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||||
|
%update = or i32 %oldval, 32832
|
||||||
|
store i32 %update, i32 *%ptr
|
||||||
|
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
; svml stuff
|
; svml stuff
|
||||||
|
|
||||||
@@ -104,17 +158,17 @@ declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
|
|||||||
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
|
||||||
|
|
||||||
define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
|
define internal <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
|
||||||
unary4to8(ret, float, @__svml_sinf4, %0)
|
unary4to8(ret, float, @__svml_sinf4, %0)
|
||||||
ret <8 x float> %ret
|
ret <8 x float> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
|
define internal <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
|
||||||
unary4to8(ret, float, @__svml_cosf4, %0)
|
unary4to8(ret, float, @__svml_cosf4, %0)
|
||||||
ret <8 x float> %ret
|
ret <8 x float> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define void @__svml_sincos(<8 x float>, <8 x float> *,
|
define internal void @__svml_sincos(<8 x float>, <8 x float> *,
|
||||||
<8 x float> *) nounwind readnone alwaysinline {
|
<8 x float> *) nounwind readnone alwaysinline {
|
||||||
; call svml_sincosf4 two times with the two 4-wide sub-vectors
|
; call svml_sincosf4 two times with the two 4-wide sub-vectors
|
||||||
%a = shufflevector <8 x float> %0, <8 x float> undef,
|
%a = shufflevector <8 x float> %0, <8 x float> undef,
|
||||||
@@ -143,33 +197,33 @@ define void @__svml_sincos(<8 x float>, <8 x float> *,
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
|
define internal <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
|
||||||
unary4to8(ret, float, @__svml_tanf4, %0)
|
unary4to8(ret, float, @__svml_tanf4, %0)
|
||||||
ret <8 x float> %ret
|
ret <8 x float> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
|
define internal <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
|
||||||
unary4to8(ret, float, @__svml_atanf4, %0)
|
unary4to8(ret, float, @__svml_atanf4, %0)
|
||||||
ret <8 x float> %ret
|
ret <8 x float> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x float> @__svml_atan2(<8 x float>,
|
define internal <8 x float> @__svml_atan2(<8 x float>,
|
||||||
<8 x float>) nounwind readnone alwaysinline {
|
<8 x float>) nounwind readnone alwaysinline {
|
||||||
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
|
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
|
||||||
ret <8 x float> %ret
|
ret <8 x float> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
|
define internal <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
|
||||||
unary4to8(ret, float, @__svml_expf4, %0)
|
unary4to8(ret, float, @__svml_expf4, %0)
|
||||||
ret <8 x float> %ret
|
ret <8 x float> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
|
define internal <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
|
||||||
unary4to8(ret, float, @__svml_logf4, %0)
|
unary4to8(ret, float, @__svml_logf4, %0)
|
||||||
ret <8 x float> %ret
|
ret <8 x float> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x float> @__svml_pow(<8 x float>,
|
define internal <8 x float> @__svml_pow(<8 x float>,
|
||||||
<8 x float>) nounwind readnone alwaysinline {
|
<8 x float>) nounwind readnone alwaysinline {
|
||||||
binary4to8(ret, float, @__svml_powf4, %0, %1)
|
binary4to8(ret, float, @__svml_powf4, %0, %1)
|
||||||
ret <8 x float> %ret
|
ret <8 x float> %ret
|
||||||
@@ -180,52 +234,91 @@ define <8 x float> @__svml_pow(<8 x float>,
|
|||||||
;; float min/max
|
;; float min/max
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
|
||||||
define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||||
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
|
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
|
||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||||
|
ret float %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||||
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
|
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
|
||||||
ret <8 x float> %call
|
ret <8 x float> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||||
|
ret float %ret
|
||||||
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; int32 min/max
|
;; int32 min/max
|
||||||
|
|
||||||
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
|
||||||
|
define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||||
ret <8 x i32> %call
|
ret <8 x i32> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||||
|
ret i32 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||||
ret <8 x i32> %call
|
ret <8 x i32> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||||
|
ret i32 %ret
|
||||||
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
; unsigned int min/max
|
; unsigned int min/max
|
||||||
|
|
||||||
define <8 x i32> @__min_varying_uint32(<8 x i32>,
|
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||||
|
|
||||||
|
define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
|
||||||
<8 x i32>) nounwind readonly alwaysinline {
|
<8 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
|
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||||
ret <8 x i32> %call
|
ret <8 x i32> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x i32> @__max_varying_uint32(<8 x i32>,
|
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||||
|
ret i32 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
|
||||||
<8 x i32>) nounwind readonly alwaysinline {
|
<8 x i32>) nounwind readonly alwaysinline {
|
||||||
binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||||
ret <8 x i32> %call
|
ret <8 x i32> %call
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||||
|
ret i32 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
; horizontal ops / reductions
|
; horizontal ops / reductions
|
||||||
|
|
||||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||||
|
|
||||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||||
; first do two 4-wide movmsk calls
|
; first do two 4-wide movmsk calls
|
||||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||||
@@ -242,103 +335,103 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
|||||||
ret i32 %v
|
ret i32 %v
|
||||||
}
|
}
|
||||||
|
|
||||||
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||||
reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
|
reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
|
||||||
}
|
}
|
||||||
|
|
||||||
define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||||
reduce8by4(float, @llvm.x86.sse.max.ps, @__max_uniform_float)
|
reduce8by4(float, @llvm.x86.sse.max.ps, @__max_uniform_float)
|
||||||
}
|
}
|
||||||
|
|
||||||
; helper function for reduce_add_int32
|
; helper function for reduce_add_int32
|
||||||
define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
|
define internal <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
|
||||||
<4 x i32> %v1) nounwind readnone alwaysinline {
|
<4 x i32> %v1) nounwind readnone alwaysinline {
|
||||||
%v = add <4 x i32> %v0, %v1
|
%v = add <4 x i32> %v0, %v1
|
||||||
ret <4 x i32> %v
|
ret <4 x i32> %v
|
||||||
}
|
}
|
||||||
|
|
||||||
; helper function for reduce_add_int32
|
; helper function for reduce_add_int32
|
||||||
define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
|
define internal i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
|
||||||
%v = add i32 %0, %1
|
%v = add i32 %0, %1
|
||||||
ret i32 %v
|
ret i32 %v
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce8by4(i32, @__vec4_add_int32, @__add_int32)
|
reduce8by4(i32, @__vec4_add_int32, @__add_int32)
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce8by4(i32, @llvm.x86.sse41.pminsd, @__min_uniform_int32)
|
reduce8by4(i32, @llvm.x86.sse41.pminsd, @__min_uniform_int32)
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
|
reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||||
ret i32 %r
|
ret i32 %r
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
|
reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||||
reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
|
reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x double> @__add_varying_double(<4 x double>,
|
define internal <4 x double> @__add_varying_double(<4 x double>,
|
||||||
<4 x double>) nounwind readnone alwaysinline {
|
<4 x double>) nounwind readnone alwaysinline {
|
||||||
%r = fadd <4 x double> %0, %1
|
%r = fadd <4 x double> %0, %1
|
||||||
ret <4 x double> %r
|
ret <4 x double> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
|
define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||||
%r = fadd double %0, %1
|
%r = fadd double %0, %1
|
||||||
ret double %r
|
ret double %r
|
||||||
}
|
}
|
||||||
|
|
||||||
define double @__reduce_add_double(<8 x double>) nounwind readnone {
|
define internal double @__reduce_add_double(<8 x double>) nounwind readnone {
|
||||||
reduce8by4(double, @__add_varying_double, @__add_uniform_double)
|
reduce8by4(double, @__add_varying_double, @__add_uniform_double)
|
||||||
}
|
}
|
||||||
|
|
||||||
define double @__reduce_min_double(<8 x double>) nounwind readnone {
|
define internal double @__reduce_min_double(<8 x double>) nounwind readnone {
|
||||||
reduce8(double, @__min_varying_double, @__min_uniform_double)
|
reduce8(double, @__min_varying_double, @__min_uniform_double)
|
||||||
}
|
}
|
||||||
|
|
||||||
define double @__reduce_max_double(<8 x double>) nounwind readnone {
|
define internal double @__reduce_max_double(<8 x double>) nounwind readnone {
|
||||||
reduce8(double, @__max_varying_double, @__max_uniform_double)
|
reduce8(double, @__max_varying_double, @__max_uniform_double)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x i64> @__add_varying_int64(<4 x i64>,
|
define internal <4 x i64> @__add_varying_int64(<4 x i64>,
|
||||||
<4 x i64>) nounwind readnone alwaysinline {
|
<4 x i64>) nounwind readnone alwaysinline {
|
||||||
%r = add <4 x i64> %0, %1
|
%r = add <4 x i64> %0, %1
|
||||||
ret <4 x i64> %r
|
ret <4 x i64> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||||
%r = add i64 %0, %1
|
%r = add i64 %0, %1
|
||||||
ret i64 %r
|
ret i64 %r
|
||||||
}
|
}
|
||||||
|
|
||||||
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
|
define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
|
||||||
reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
|
reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||||
}
|
}
|
||||||
|
|
||||||
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
|
define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
|
||||||
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
|
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||||
}
|
}
|
||||||
|
|
||||||
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
|
define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
|
||||||
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
|
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||||
}
|
}
|
||||||
|
|
||||||
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
|
define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
|
||||||
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||||
}
|
}
|
||||||
|
|
||||||
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
|
define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
|
||||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -374,47 +467,129 @@ gen_scatter(8, i64)
|
|||||||
;; float rounding
|
;; float rounding
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
|
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
|
||||||
|
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||||
|
|
||||||
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||||
round4to8(%0, 8)
|
round4to8(%0, 8)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||||
|
; the roundss intrinsic is a total mess--docs say:
|
||||||
|
;
|
||||||
|
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
||||||
|
;
|
||||||
|
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
||||||
|
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
||||||
|
; return value is described by the following equations:
|
||||||
|
;
|
||||||
|
; r0 = RND(b0)
|
||||||
|
; r1 = a1
|
||||||
|
; r2 = a2
|
||||||
|
; r3 = a3
|
||||||
|
;
|
||||||
|
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||||
|
; here. So we pass the same register for both.
|
||||||
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||||
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
|
ret float %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
round4to8(%0, 9)
|
round4to8(%0, 9)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; see above for round_ss intrinsic discussion...
|
||||||
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||||
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
|
ret float %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
round4to8(%0, 10)
|
round4to8(%0, 10)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
|
; see above for round_ss intrinsic discussion...
|
||||||
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
|
; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||||
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
|
ret float %rs
|
||||||
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rounding doubles
|
;; rounding doubles
|
||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
|
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
|
||||||
|
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||||
|
|
||||||
define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||||
round2to8double(%0, 8)
|
round2to8double(%0, 8)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||||
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
|
ret double %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
round2to8double(%0, 9)
|
round2to8double(%0, 9)
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
|
; see above for round_ss intrinsic discussion...
|
||||||
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
|
; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||||
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
|
ret double %rs
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
round2to8double(%0, 10)
|
round2to8double(%0, 10)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||||
|
; see above for round_ss intrinsic discussion...
|
||||||
|
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||||
|
; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
|
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||||
|
%rs = extractelement <2 x double> %xr, i32 0
|
||||||
|
ret double %rs
|
||||||
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
; horizontal ops / reductions
|
; horizontal ops / reductions
|
||||||
|
|
||||||
|
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||||
|
|
||||||
|
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||||
|
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||||
|
ret i32 %call
|
||||||
|
}
|
||||||
|
|
||||||
|
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||||
|
|
||||||
|
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||||
|
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||||
|
ret i64 %call
|
||||||
|
}
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
|
||||||
define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||||
%a = shufflevector <8 x float> %0, <8 x float> undef,
|
%a = shufflevector <8 x float> %0, <8 x float> undef,
|
||||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||||
%b = shufflevector <8 x float> %0, <8 x float> undef,
|
%b = shufflevector <8 x float> %0, <8 x float> undef,
|
||||||
@@ -543,24 +718,44 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
|||||||
;; double precision sqrt
|
;; double precision sqrt
|
||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||||
|
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||||
|
|
||||||
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||||
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||||
ret <8 x double> %ret
|
ret <8 x double> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||||
|
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||||
|
ret double %ret
|
||||||
|
}
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; double precision float min/max
|
;; double precision float min/max
|
||||||
|
|
||||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||||
|
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||||
|
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||||
|
|
||||||
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||||
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||||
ret <8 x double> %ret
|
ret <8 x double> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||||
|
ret double %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||||
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||||
ret <8 x double> %ret
|
ret <8 x double> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||||
|
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||||
|
ret double %ret
|
||||||
|
|
||||||
|
}
|
||||||
432
builtins.cpp
432
builtins.cpp
@@ -114,39 +114,59 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
|||||||
|
|
||||||
// pointers to uniform
|
// pointers to uniform
|
||||||
else if (t == LLVMTypes::Int8PointerType)
|
else if (t == LLVMTypes::Int8PointerType)
|
||||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt8 :
|
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt8 :
|
||||||
AtomicType::UniformInt8);
|
AtomicType::UniformInt8, false);
|
||||||
else if (t == LLVMTypes::Int16PointerType)
|
else if (t == LLVMTypes::Int16PointerType)
|
||||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt16 :
|
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt16 :
|
||||||
AtomicType::UniformInt16);
|
AtomicType::UniformInt16, false);
|
||||||
else if (t == LLVMTypes::Int32PointerType)
|
else if (t == LLVMTypes::Int32PointerType)
|
||||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt32 :
|
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 :
|
||||||
AtomicType::UniformInt32);
|
AtomicType::UniformInt32, false);
|
||||||
else if (t == LLVMTypes::Int64PointerType)
|
else if (t == LLVMTypes::Int64PointerType)
|
||||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt64 :
|
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt64 :
|
||||||
AtomicType::UniformInt64);
|
AtomicType::UniformInt64, false);
|
||||||
else if (t == LLVMTypes::FloatPointerType)
|
else if (t == LLVMTypes::FloatPointerType)
|
||||||
return PointerType::GetUniform(AtomicType::UniformFloat);
|
return new ReferenceType(AtomicType::UniformFloat, false);
|
||||||
else if (t == LLVMTypes::DoublePointerType)
|
else if (t == LLVMTypes::DoublePointerType)
|
||||||
return PointerType::GetUniform(AtomicType::UniformDouble);
|
return new ReferenceType(AtomicType::UniformDouble, false);
|
||||||
|
|
||||||
// pointers to varying
|
// pointers to varying
|
||||||
else if (t == LLVMTypes::Int8VectorPointerType)
|
else if (t == LLVMTypes::Int8VectorPointerType)
|
||||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt8 :
|
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt8 :
|
||||||
AtomicType::VaryingInt8);
|
AtomicType::VaryingInt8, false);
|
||||||
else if (t == LLVMTypes::Int16VectorPointerType)
|
else if (t == LLVMTypes::Int16VectorPointerType)
|
||||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt16 :
|
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt16 :
|
||||||
AtomicType::VaryingInt16);
|
AtomicType::VaryingInt16, false);
|
||||||
else if (t == LLVMTypes::Int32VectorPointerType)
|
else if (t == LLVMTypes::Int32VectorPointerType)
|
||||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt32 :
|
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
|
||||||
AtomicType::VaryingInt32);
|
AtomicType::VaryingInt32, false);
|
||||||
else if (t == LLVMTypes::Int64VectorPointerType)
|
else if (t == LLVMTypes::Int64VectorPointerType)
|
||||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt64 :
|
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt64 :
|
||||||
AtomicType::VaryingInt64);
|
AtomicType::VaryingInt64, false);
|
||||||
else if (t == LLVMTypes::FloatVectorPointerType)
|
else if (t == LLVMTypes::FloatVectorPointerType)
|
||||||
return PointerType::GetUniform(AtomicType::VaryingFloat);
|
return new ReferenceType(AtomicType::VaryingFloat, false);
|
||||||
else if (t == LLVMTypes::DoubleVectorPointerType)
|
else if (t == LLVMTypes::DoubleVectorPointerType)
|
||||||
return PointerType::GetUniform(AtomicType::VaryingDouble);
|
return new ReferenceType(AtomicType::VaryingDouble, false);
|
||||||
|
|
||||||
|
// arrays
|
||||||
|
else if (llvm::isa<const llvm::PointerType>(t)) {
|
||||||
|
const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);
|
||||||
|
|
||||||
|
// Is it a pointer to an unsized array of objects? If so, then
|
||||||
|
// create the equivalent ispc type. Note that it has to be a
|
||||||
|
// reference to an array, since ispc passes arrays to functions by
|
||||||
|
// reference.
|
||||||
|
const llvm::ArrayType *at =
|
||||||
|
llvm::dyn_cast<const llvm::ArrayType>(pt->getElementType());
|
||||||
|
if (at != NULL) {
|
||||||
|
const Type *eltType = lLLVMTypeToISPCType(at->getElementType(),
|
||||||
|
intAsUnsigned);
|
||||||
|
if (eltType == NULL)
|
||||||
|
return NULL;
|
||||||
|
return new ReferenceType(new ArrayType(eltType, at->getNumElements()),
|
||||||
|
false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
@@ -161,9 +181,11 @@ lCreateSymbol(const std::string &name, const Type *returnType,
|
|||||||
noPos.name = "__stdlib";
|
noPos.name = "__stdlib";
|
||||||
|
|
||||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||||
|
// set NULL default arguments
|
||||||
Debug(noPos, "Created builtin symbol \"%s\" [%s]\n", name.c_str(),
|
std::vector<ConstExpr *> defaults;
|
||||||
funcType->GetString().c_str());
|
for (unsigned int j = 0; j < ftype->getNumParams(); ++j)
|
||||||
|
defaults.push_back(NULL);
|
||||||
|
funcType->SetArgumentDefaults(defaults);
|
||||||
|
|
||||||
Symbol *sym = new Symbol(name, noPos, funcType);
|
Symbol *sym = new Symbol(name, noPos, funcType);
|
||||||
sym->function = func;
|
sym->function = func;
|
||||||
@@ -186,9 +208,6 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
|||||||
if (name.size() < 3 || name[0] != '_' || name[1] != '_')
|
if (name.size() < 3 || name[0] != '_' || name[1] != '_')
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
Debug(SourcePos(), "Attempting to create ispc symbol for function \"%s\".",
|
|
||||||
name.c_str());
|
|
||||||
|
|
||||||
// An unfortunate hack: we want this builtin function to have the
|
// An unfortunate hack: we want this builtin function to have the
|
||||||
// signature "int __sext_varying_bool(bool)", but the ispc function
|
// signature "int __sext_varying_bool(bool)", but the ispc function
|
||||||
// symbol creation code below assumes that any LLVM vector of i32s is a
|
// symbol creation code below assumes that any LLVM vector of i32s is a
|
||||||
@@ -198,8 +217,11 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
|||||||
const Type *returnType = AtomicType::VaryingInt32;
|
const Type *returnType = AtomicType::VaryingInt32;
|
||||||
std::vector<const Type *> argTypes;
|
std::vector<const Type *> argTypes;
|
||||||
argTypes.push_back(AtomicType::VaryingBool);
|
argTypes.push_back(AtomicType::VaryingBool);
|
||||||
|
std::vector<ConstExpr *> defaults;
|
||||||
|
defaults.push_back(NULL);
|
||||||
|
|
||||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||||
|
funcType->SetArgumentDefaults(defaults);
|
||||||
|
|
||||||
Symbol *sym = new Symbol(name, noPos, funcType);
|
Symbol *sym = new Symbol(name, noPos, funcType);
|
||||||
sym->function = func;
|
sym->function = func;
|
||||||
@@ -216,27 +238,22 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
|||||||
|
|
||||||
const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType(),
|
const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType(),
|
||||||
intAsUnsigned);
|
intAsUnsigned);
|
||||||
if (returnType == NULL) {
|
if (!returnType)
|
||||||
Debug(SourcePos(), "Failed: return type not representable for "
|
|
||||||
"builtin %s.", name.c_str());
|
|
||||||
// return type not representable in ispc -> not callable from ispc
|
// return type not representable in ispc -> not callable from ispc
|
||||||
return false;
|
return false;
|
||||||
}
|
|
||||||
|
|
||||||
// Iterate over the arguments and try to find their equivalent ispc
|
// Iterate over the arguments and try to find their equivalent ispc
|
||||||
// types. Track if any of the arguments has an integer type.
|
// types. Track if any of the arguments has an integer type.
|
||||||
bool anyIntArgs = false;
|
bool anyIntArgs = false, anyReferenceArgs = false;
|
||||||
std::vector<const Type *> argTypes;
|
std::vector<const Type *> argTypes;
|
||||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
|
for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
|
||||||
const llvm::Type *llvmArgType = ftype->getParamType(j);
|
const llvm::Type *llvmArgType = ftype->getParamType(j);
|
||||||
const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
|
const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
|
||||||
if (type == NULL) {
|
if (type == NULL)
|
||||||
Debug(SourcePos(), "Failed: type of parameter %d not "
|
|
||||||
"representable for builtin %s", j, name.c_str());
|
|
||||||
return false;
|
return false;
|
||||||
}
|
|
||||||
anyIntArgs |=
|
anyIntArgs |=
|
||||||
(Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
|
(Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
|
||||||
|
anyReferenceArgs |= (dynamic_cast<const ReferenceType *>(type) != NULL);
|
||||||
argTypes.push_back(type);
|
argTypes.push_back(type);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -244,6 +261,19 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
|||||||
// so that we get symbols for things with no integer types!
|
// so that we get symbols for things with no integer types!
|
||||||
if (i == 0 || anyIntArgs == true)
|
if (i == 0 || anyIntArgs == true)
|
||||||
lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
|
lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
|
||||||
|
|
||||||
|
// If there are any reference types, also make a variant of the
|
||||||
|
// symbol that has them as const references. This obviously
|
||||||
|
// doesn't make sense for many builtins, but we'll give the stdlib
|
||||||
|
// the option to call one if it needs one.
|
||||||
|
if (anyReferenceArgs == true) {
|
||||||
|
for (unsigned int j = 0; j < argTypes.size(); ++j) {
|
||||||
|
if (dynamic_cast<const ReferenceType *>(argTypes[j]) != NULL)
|
||||||
|
argTypes[j] = argTypes[j]->GetAsConstType();
|
||||||
|
lCreateSymbol(name + "_refsconst", returnType, argTypes,
|
||||||
|
ftype, func, symbolTable);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
@@ -297,263 +327,6 @@ lCheckModuleIntrinsics(llvm::Module *module) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/** We'd like to have all of these functions declared as 'internal' in
|
|
||||||
their respective bitcode files so that if they aren't needed by the
|
|
||||||
user's program they are eliminated from the final output. However, if
|
|
||||||
we do so, then they aren't brought in by the LinkModules() call below
|
|
||||||
since they aren't yet used by anything in the module they're being
|
|
||||||
linked with (in LLVM 3.1, at least).
|
|
||||||
|
|
||||||
Therefore, we don't declare them as internal when we first define them,
|
|
||||||
but instead mark them as internal after they've been linked in. This
|
|
||||||
is admittedly a kludge.
|
|
||||||
*/
|
|
||||||
static void
|
|
||||||
lSetInternalFunctions(llvm::Module *module) {
|
|
||||||
const char *names[] = {
|
|
||||||
"__add_uniform_int32",
|
|
||||||
"__add_uniform_int64",
|
|
||||||
"__add_varying_int32",
|
|
||||||
"__add_varying_int64",
|
|
||||||
"__aos_to_soa3_float",
|
|
||||||
"__aos_to_soa3_float16",
|
|
||||||
"__aos_to_soa3_float4",
|
|
||||||
"__aos_to_soa3_float8",
|
|
||||||
"__aos_to_soa3_int32",
|
|
||||||
"__aos_to_soa4_float",
|
|
||||||
"__aos_to_soa4_float16",
|
|
||||||
"__aos_to_soa4_float4",
|
|
||||||
"__aos_to_soa4_float8",
|
|
||||||
"__aos_to_soa4_int32",
|
|
||||||
"__atomic_add_int32_global",
|
|
||||||
"__atomic_add_int64_global",
|
|
||||||
"__atomic_add_uniform_int32_global",
|
|
||||||
"__atomic_add_uniform_int64_global",
|
|
||||||
"__atomic_and_int32_global",
|
|
||||||
"__atomic_and_int64_global",
|
|
||||||
"__atomic_and_uniform_int32_global",
|
|
||||||
"__atomic_and_uniform_int64_global",
|
|
||||||
"__atomic_compare_exchange_double_global",
|
|
||||||
"__atomic_compare_exchange_float_global",
|
|
||||||
"__atomic_compare_exchange_int32_global",
|
|
||||||
"__atomic_compare_exchange_int64_global",
|
|
||||||
"__atomic_compare_exchange_uniform_double_global",
|
|
||||||
"__atomic_compare_exchange_uniform_float_global",
|
|
||||||
"__atomic_compare_exchange_uniform_int32_global",
|
|
||||||
"__atomic_compare_exchange_uniform_int64_global",
|
|
||||||
"__atomic_max_uniform_int32_global",
|
|
||||||
"__atomic_max_uniform_int64_global",
|
|
||||||
"__atomic_min_uniform_int32_global",
|
|
||||||
"__atomic_min_uniform_int64_global",
|
|
||||||
"__atomic_or_int32_global",
|
|
||||||
"__atomic_or_int64_global",
|
|
||||||
"__atomic_or_uniform_int32_global",
|
|
||||||
"__atomic_or_uniform_int64_global",
|
|
||||||
"__atomic_sub_int32_global",
|
|
||||||
"__atomic_sub_int64_global",
|
|
||||||
"__atomic_sub_uniform_int32_global",
|
|
||||||
"__atomic_sub_uniform_int64_global",
|
|
||||||
"__atomic_swap_double_global",
|
|
||||||
"__atomic_swap_float_global",
|
|
||||||
"__atomic_swap_int32_global",
|
|
||||||
"__atomic_swap_int64_global",
|
|
||||||
"__atomic_swap_uniform_double_global",
|
|
||||||
"__atomic_swap_uniform_float_global",
|
|
||||||
"__atomic_swap_uniform_int32_global",
|
|
||||||
"__atomic_swap_uniform_int64_global",
|
|
||||||
"__atomic_umax_uniform_uint32_global",
|
|
||||||
"__atomic_umax_uniform_uint64_global",
|
|
||||||
"__atomic_umin_uniform_uint32_global",
|
|
||||||
"__atomic_umin_uniform_uint64_global",
|
|
||||||
"__atomic_xor_int32_global",
|
|
||||||
"__atomic_xor_int64_global",
|
|
||||||
"__atomic_xor_uniform_int32_global",
|
|
||||||
"__atomic_xor_uniform_int64_global",
|
|
||||||
"__broadcast_double",
|
|
||||||
"__broadcast_float",
|
|
||||||
"__broadcast_int16",
|
|
||||||
"__broadcast_int32",
|
|
||||||
"__broadcast_int64",
|
|
||||||
"__broadcast_int8",
|
|
||||||
"__ceil_uniform_double",
|
|
||||||
"__ceil_uniform_float",
|
|
||||||
"__ceil_varying_double",
|
|
||||||
"__ceil_varying_float",
|
|
||||||
"__count_trailing_zeros_i32",
|
|
||||||
"__count_trailing_zeros_i64",
|
|
||||||
"__count_leading_zeros_i32",
|
|
||||||
"__count_leading_zeros_i64",
|
|
||||||
"__do_assert_uniform",
|
|
||||||
"__do_assert_varying",
|
|
||||||
"__do_print",
|
|
||||||
"__doublebits_uniform_int64",
|
|
||||||
"__doublebits_varying_int64",
|
|
||||||
"__exclusive_scan_add_double",
|
|
||||||
"__exclusive_scan_add_float",
|
|
||||||
"__exclusive_scan_add_i32",
|
|
||||||
"__exclusive_scan_add_i64",
|
|
||||||
"__exclusive_scan_and_i32",
|
|
||||||
"__exclusive_scan_and_i64",
|
|
||||||
"__exclusive_scan_or_i32",
|
|
||||||
"__exclusive_scan_or_i64",
|
|
||||||
"__extract_int16",
|
|
||||||
"__extract_int32",
|
|
||||||
"__extract_int64",
|
|
||||||
"__extract_int8",
|
|
||||||
"__fastmath",
|
|
||||||
"__floatbits_uniform_int32",
|
|
||||||
"__floatbits_varying_int32",
|
|
||||||
"__floor_uniform_double",
|
|
||||||
"__floor_uniform_float",
|
|
||||||
"__floor_varying_double",
|
|
||||||
"__floor_varying_float",
|
|
||||||
"__insert_int16",
|
|
||||||
"__insert_int32",
|
|
||||||
"__insert_int64",
|
|
||||||
"__insert_int8",
|
|
||||||
"__intbits_uniform_double",
|
|
||||||
"__intbits_uniform_float",
|
|
||||||
"__intbits_varying_double",
|
|
||||||
"__intbits_varying_float",
|
|
||||||
"__max_uniform_double",
|
|
||||||
"__max_uniform_float",
|
|
||||||
"__max_uniform_int32",
|
|
||||||
"__max_uniform_int64",
|
|
||||||
"__max_uniform_uint32",
|
|
||||||
"__max_uniform_uint64",
|
|
||||||
"__max_varying_double",
|
|
||||||
"__max_varying_float",
|
|
||||||
"__max_varying_int32",
|
|
||||||
"__max_varying_int64",
|
|
||||||
"__max_varying_uint32",
|
|
||||||
"__max_varying_uint64",
|
|
||||||
"__memory_barrier",
|
|
||||||
"__min_uniform_double",
|
|
||||||
"__min_uniform_float",
|
|
||||||
"__min_uniform_int32",
|
|
||||||
"__min_uniform_int64",
|
|
||||||
"__min_uniform_uint32",
|
|
||||||
"__min_uniform_uint64",
|
|
||||||
"__min_varying_double",
|
|
||||||
"__min_varying_float",
|
|
||||||
"__min_varying_int32",
|
|
||||||
"__min_varying_int64",
|
|
||||||
"__min_varying_uint32",
|
|
||||||
"__min_varying_uint64",
|
|
||||||
"__movmsk",
|
|
||||||
"__num_cores",
|
|
||||||
"__packed_load_active",
|
|
||||||
"__packed_store_active",
|
|
||||||
"__popcnt_int32",
|
|
||||||
"__popcnt_int64",
|
|
||||||
"__prefetch_read_uniform_1",
|
|
||||||
"__prefetch_read_uniform_2",
|
|
||||||
"__prefetch_read_uniform_3",
|
|
||||||
"__prefetch_read_uniform_nt",
|
|
||||||
"__rcp_uniform_float",
|
|
||||||
"__rcp_varying_float",
|
|
||||||
"__reduce_add_double",
|
|
||||||
"__reduce_add_float",
|
|
||||||
"__reduce_add_int32",
|
|
||||||
"__reduce_add_int64",
|
|
||||||
"__reduce_add_uint32",
|
|
||||||
"__reduce_add_uint64",
|
|
||||||
"__reduce_equal_double",
|
|
||||||
"__reduce_equal_float",
|
|
||||||
"__reduce_equal_int32",
|
|
||||||
"__reduce_equal_int64",
|
|
||||||
"__reduce_max_double",
|
|
||||||
"__reduce_max_float",
|
|
||||||
"__reduce_max_int32",
|
|
||||||
"__reduce_max_int64",
|
|
||||||
"__reduce_max_uint32",
|
|
||||||
"__reduce_max_uint64",
|
|
||||||
"__reduce_min_double",
|
|
||||||
"__reduce_min_float",
|
|
||||||
"__reduce_min_int32",
|
|
||||||
"__reduce_min_int64",
|
|
||||||
"__reduce_min_uint32",
|
|
||||||
"__reduce_min_uint64",
|
|
||||||
"__rotate_double",
|
|
||||||
"__rotate_float",
|
|
||||||
"__rotate_int16",
|
|
||||||
"__rotate_int32",
|
|
||||||
"__rotate_int64",
|
|
||||||
"__rotate_int8",
|
|
||||||
"__round_uniform_double",
|
|
||||||
"__round_uniform_float",
|
|
||||||
"__round_varying_double",
|
|
||||||
"__round_varying_float",
|
|
||||||
"__rsqrt_uniform_float",
|
|
||||||
"__rsqrt_varying_float",
|
|
||||||
"__sext_uniform_bool",
|
|
||||||
"__sext_varying_bool",
|
|
||||||
"__shuffle2_double",
|
|
||||||
"__shuffle2_float",
|
|
||||||
"__shuffle2_int16",
|
|
||||||
"__shuffle2_int32",
|
|
||||||
"__shuffle2_int64",
|
|
||||||
"__shuffle2_int8",
|
|
||||||
"__shuffle_double",
|
|
||||||
"__shuffle_float",
|
|
||||||
"__shuffle_int16",
|
|
||||||
"__shuffle_int32",
|
|
||||||
"__shuffle_int64",
|
|
||||||
"__shuffle_int8",
|
|
||||||
"__soa_to_aos3_float",
|
|
||||||
"__soa_to_aos3_float16",
|
|
||||||
"__soa_to_aos3_float4",
|
|
||||||
"__soa_to_aos3_float8",
|
|
||||||
"__soa_to_aos3_int32",
|
|
||||||
"__soa_to_aos4_float",
|
|
||||||
"__soa_to_aos4_float16",
|
|
||||||
"__soa_to_aos4_float4",
|
|
||||||
"__soa_to_aos4_float8",
|
|
||||||
"__soa_to_aos4_int32",
|
|
||||||
"__sqrt_uniform_double",
|
|
||||||
"__sqrt_uniform_float",
|
|
||||||
"__sqrt_varying_double",
|
|
||||||
"__sqrt_varying_float",
|
|
||||||
"__stdlib_atan",
|
|
||||||
"__stdlib_atan2",
|
|
||||||
"__stdlib_atan2f",
|
|
||||||
"__stdlib_atanf",
|
|
||||||
"__stdlib_cos",
|
|
||||||
"__stdlib_cosf",
|
|
||||||
"__stdlib_exp",
|
|
||||||
"__stdlib_expf",
|
|
||||||
"__stdlib_log",
|
|
||||||
"__stdlib_logf",
|
|
||||||
"__stdlib_pow",
|
|
||||||
"__stdlib_powf",
|
|
||||||
"__stdlib_sin",
|
|
||||||
"__stdlib_sincos",
|
|
||||||
"__stdlib_sincosf",
|
|
||||||
"__stdlib_sinf",
|
|
||||||
"__stdlib_tan",
|
|
||||||
"__stdlib_tanf",
|
|
||||||
"__svml_sin",
|
|
||||||
"__svml_cos",
|
|
||||||
"__svml_sincos",
|
|
||||||
"__svml_tan",
|
|
||||||
"__svml_atan",
|
|
||||||
"__svml_atan2",
|
|
||||||
"__svml_exp",
|
|
||||||
"__svml_log",
|
|
||||||
"__svml_pow",
|
|
||||||
"__undef_uniform",
|
|
||||||
"__undef_varying",
|
|
||||||
};
|
|
||||||
|
|
||||||
int count = sizeof(names) / sizeof(names[0]);
|
|
||||||
for (int i = 0; i < count; ++i) {
|
|
||||||
llvm::Function *f = module->getFunction(names[i]);
|
|
||||||
if (f != NULL)
|
|
||||||
f->setLinkage(llvm::GlobalValue::InternalLinkage);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/** This utility function takes serialized binary LLVM bitcode and adds its
|
/** This utility function takes serialized binary LLVM bitcode and adds its
|
||||||
definitions to the given module. Functions in the bitcode that can be
|
definitions to the given module. Functions in the bitcode that can be
|
||||||
mapped to ispc functions are also added to the symbol table.
|
mapped to ispc functions are also added to the symbol table.
|
||||||
@@ -563,9 +336,9 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
@param module Module to link the bitcode into
|
@param module Module to link the bitcode into
|
||||||
@param symbolTable Symbol table to add definitions to
|
@param symbolTable Symbol table to add definitions to
|
||||||
*/
|
*/
|
||||||
void
|
static void
|
||||||
AddBitcodeToModule(const unsigned char *bitcode, int length,
|
lAddBitcode(const unsigned char *bitcode, int length,
|
||||||
llvm::Module *module, SymbolTable *symbolTable) {
|
llvm::Module *module, SymbolTable *symbolTable) {
|
||||||
std::string bcErr;
|
std::string bcErr;
|
||||||
llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
|
llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
|
||||||
llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
|
llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
|
||||||
@@ -590,15 +363,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
|
|||||||
bcModule->setTargetTriple(mTriple.str());
|
bcModule->setTargetTriple(mTriple.str());
|
||||||
|
|
||||||
std::string(linkError);
|
std::string(linkError);
|
||||||
if (llvm::Linker::LinkModules(module, bcModule,
|
if (llvm::Linker::LinkModules(module, bcModule, &linkError))
|
||||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
|
||||||
llvm::Linker::DestroySource,
|
|
||||||
#endif // LLVM_3_0
|
|
||||||
&linkError))
|
|
||||||
Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
|
Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
|
||||||
lSetInternalFunctions(module);
|
lAddModuleSymbols(module, symbolTable);
|
||||||
if (symbolTable != NULL)
|
|
||||||
lAddModuleSymbols(module, symbolTable);
|
|
||||||
lCheckModuleIntrinsics(module);
|
lCheckModuleIntrinsics(module);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -610,8 +377,8 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
|
|||||||
static void
|
static void
|
||||||
lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
||||||
SymbolTable *symbolTable) {
|
SymbolTable *symbolTable) {
|
||||||
Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32,
|
Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32);
|
||||||
SC_STATIC);
|
pw->isStatic = true;
|
||||||
pw->constValue = new ConstExpr(pw->type, val, SourcePos());
|
pw->constValue = new ConstExpr(pw->type, val, SourcePos());
|
||||||
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
|
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
|
||||||
llvm::Constant *linit = LLVMInt32(val);
|
llvm::Constant *linit = LLVMInt32(val);
|
||||||
@@ -628,7 +395,8 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
|||||||
SymbolTable *symbolTable) {
|
SymbolTable *symbolTable) {
|
||||||
std::vector<const Type *> args;
|
std::vector<const Type *> args;
|
||||||
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
|
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
|
||||||
Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
|
Symbol *sym = new Symbol(name, SourcePos(), ft);
|
||||||
|
sym->isStatic = true;
|
||||||
|
|
||||||
llvm::Function *func = module->getFunction(name);
|
llvm::Function *func = module->getFunction(name);
|
||||||
assert(func != NULL); // it should be declared already...
|
assert(func != NULL); // it should be declared already...
|
||||||
@@ -645,7 +413,8 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
|||||||
static void
|
static void
|
||||||
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||||
Symbol *pidx = new Symbol("programIndex", SourcePos(),
|
Symbol *pidx = new Symbol("programIndex", SourcePos(),
|
||||||
AtomicType::VaryingConstInt32, SC_STATIC);
|
AtomicType::VaryingConstInt32);
|
||||||
|
pidx->isStatic = true;
|
||||||
|
|
||||||
int pi[ISPC_MAX_NVEC];
|
int pi[ISPC_MAX_NVEC];
|
||||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||||
@@ -665,17 +434,17 @@ void
|
|||||||
DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
|
DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
|
||||||
bool includeStdlibISPC) {
|
bool includeStdlibISPC) {
|
||||||
// Add the definitions from the compiled builtins-c.c file
|
// Add the definitions from the compiled builtins-c.c file
|
||||||
if (g->target.is32Bit) {
|
if (g->target.is32bit) {
|
||||||
extern unsigned char builtins_bitcode_c_32[];
|
extern unsigned char builtins_bitcode_c_32[];
|
||||||
extern int builtins_bitcode_c_32_length;
|
extern int builtins_bitcode_c_32_length;
|
||||||
AddBitcodeToModule(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
|
lAddBitcode(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
|
||||||
module, symbolTable);
|
module, symbolTable);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
extern unsigned char builtins_bitcode_c_64[];
|
extern unsigned char builtins_bitcode_c_64[];
|
||||||
extern int builtins_bitcode_c_64_length;
|
extern int builtins_bitcode_c_64_length;
|
||||||
AddBitcodeToModule(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
|
lAddBitcode(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
|
||||||
module, symbolTable);
|
module, symbolTable);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Next, add the target's custom implementations of the various needed
|
// Next, add the target's custom implementations of the various needed
|
||||||
@@ -684,34 +453,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
|||||||
case Target::SSE2:
|
case Target::SSE2:
|
||||||
extern unsigned char builtins_bitcode_sse2[];
|
extern unsigned char builtins_bitcode_sse2[];
|
||||||
extern int builtins_bitcode_sse2_length;
|
extern int builtins_bitcode_sse2_length;
|
||||||
extern unsigned char builtins_bitcode_sse2_x2[];
|
lAddBitcode(builtins_bitcode_sse2, builtins_bitcode_sse2_length, module,
|
||||||
extern int builtins_bitcode_sse2_x2_length;
|
symbolTable);
|
||||||
switch (g->target.vectorWidth) {
|
|
||||||
case 4:
|
|
||||||
AddBitcodeToModule(builtins_bitcode_sse2, builtins_bitcode_sse2_length,
|
|
||||||
module, symbolTable);
|
|
||||||
break;
|
|
||||||
case 8:
|
|
||||||
AddBitcodeToModule(builtins_bitcode_sse2_x2, builtins_bitcode_sse2_x2_length,
|
|
||||||
module, symbolTable);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
FATAL("logic error in DefineStdlib");
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
case Target::SSE4:
|
case Target::SSE4:
|
||||||
extern unsigned char builtins_bitcode_sse4[];
|
extern unsigned char builtins_bitcode_sse4[];
|
||||||
extern int builtins_bitcode_sse4_length;
|
extern int builtins_bitcode_sse4_length;
|
||||||
extern unsigned char builtins_bitcode_sse4_x2[];
|
extern unsigned char builtins_bitcode_sse4x2[];
|
||||||
extern int builtins_bitcode_sse4_x2_length;
|
extern int builtins_bitcode_sse4x2_length;
|
||||||
switch (g->target.vectorWidth) {
|
switch (g->target.vectorWidth) {
|
||||||
case 4:
|
case 4:
|
||||||
AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
|
lAddBitcode(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
|
||||||
module, symbolTable);
|
module, symbolTable);
|
||||||
break;
|
break;
|
||||||
case 8:
|
case 8:
|
||||||
AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length,
|
lAddBitcode(builtins_bitcode_sse4x2, builtins_bitcode_sse4x2_length,
|
||||||
module, symbolTable);
|
module, symbolTable);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
FATAL("logic error in DefineStdlib");
|
FATAL("logic error in DefineStdlib");
|
||||||
@@ -722,14 +479,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
|||||||
case 8:
|
case 8:
|
||||||
extern unsigned char builtins_bitcode_avx[];
|
extern unsigned char builtins_bitcode_avx[];
|
||||||
extern int builtins_bitcode_avx_length;
|
extern int builtins_bitcode_avx_length;
|
||||||
AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length,
|
lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module,
|
||||||
module, symbolTable);
|
symbolTable);
|
||||||
break;
|
break;
|
||||||
case 16:
|
case 16:
|
||||||
extern unsigned char builtins_bitcode_avx_x2[];
|
extern unsigned char builtins_bitcode_avx_x2[];
|
||||||
extern int builtins_bitcode_avx_x2_length;
|
extern int builtins_bitcode_avx_x2_length;
|
||||||
AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
|
lAddBitcode(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
|
||||||
module, symbolTable);
|
module, symbolTable);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
FATAL("logic error in DefineStdlib");
|
FATAL("logic error in DefineStdlib");
|
||||||
@@ -765,8 +522,11 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
|||||||
// definitions added. Disable emission of performance warnings for
|
// definitions added. Disable emission of performance warnings for
|
||||||
// now, since the user doesn't care about any of that in the stdlib
|
// now, since the user doesn't care about any of that in the stdlib
|
||||||
// implementation...
|
// implementation...
|
||||||
|
bool epf = g->emitPerfWarnings;
|
||||||
|
g->emitPerfWarnings = false;
|
||||||
extern char stdlib_code[];
|
extern char stdlib_code[];
|
||||||
yy_scan_string(stdlib_code);
|
yy_scan_string(stdlib_code);
|
||||||
yyparse();
|
yyparse();
|
||||||
|
g->emitPerfWarnings = epf;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -55,7 +55,4 @@
|
|||||||
void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
|
void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
|
||||||
bool includeStdlib);
|
bool includeStdlib);
|
||||||
|
|
||||||
void AddBitcodeToModule(const unsigned char *bitcode, int length,
|
|
||||||
llvm::Module *module, SymbolTable *symbolTable = NULL);
|
|
||||||
|
|
||||||
#endif // ISPC_STDLIB_H
|
#endif // ISPC_STDLIB_H
|
||||||
|
|||||||
1220
builtins.m4
1220
builtins.m4
File diff suppressed because it is too large
Load Diff
195
ctx.h
195
ctx.h
@@ -41,7 +41,9 @@
|
|||||||
#include "ispc.h"
|
#include "ispc.h"
|
||||||
#include <llvm/InstrTypes.h>
|
#include <llvm/InstrTypes.h>
|
||||||
#include <llvm/Instructions.h>
|
#include <llvm/Instructions.h>
|
||||||
|
#ifndef LLVM_2_8
|
||||||
#include <llvm/Analysis/DIBuilder.h>
|
#include <llvm/Analysis/DIBuilder.h>
|
||||||
|
#endif
|
||||||
#include <llvm/Analysis/DebugInfo.h>
|
#include <llvm/Analysis/DebugInfo.h>
|
||||||
|
|
||||||
struct CFInfo;
|
struct CFInfo;
|
||||||
@@ -57,22 +59,17 @@ struct CFInfo;
|
|||||||
class FunctionEmitContext {
|
class FunctionEmitContext {
|
||||||
public:
|
public:
|
||||||
/** Create a new FunctionEmitContext.
|
/** Create a new FunctionEmitContext.
|
||||||
@param function The Function object representing the function
|
@param returnType The return type of the function
|
||||||
@param funSym Symbol that corresponds to the function
|
@param function LLVM function in the current module that corresponds
|
||||||
@param llvmFunction LLVM function in the current module that corresponds
|
|
||||||
to the function
|
to the function
|
||||||
|
@param funSym Symbol that corresponds to the function
|
||||||
@param firstStmtPos Source file position of the first statement in the
|
@param firstStmtPos Source file position of the first statement in the
|
||||||
function
|
function
|
||||||
*/
|
*/
|
||||||
FunctionEmitContext(Function *function, Symbol *funSym,
|
FunctionEmitContext(const Type *returnType, llvm::Function *function, Symbol *funSym,
|
||||||
llvm::Function *llvmFunction,
|
|
||||||
SourcePos firstStmtPos);
|
SourcePos firstStmtPos);
|
||||||
~FunctionEmitContext();
|
~FunctionEmitContext();
|
||||||
|
|
||||||
/** Returns the Function * corresponding to the function that we're
|
|
||||||
currently generating code for. */
|
|
||||||
const Function *GetFunction() const;
|
|
||||||
|
|
||||||
/** @name Current basic block management
|
/** @name Current basic block management
|
||||||
@{
|
@{
|
||||||
*/
|
*/
|
||||||
@@ -86,33 +83,20 @@ public:
|
|||||||
/** @name Mask management
|
/** @name Mask management
|
||||||
@{
|
@{
|
||||||
*/
|
*/
|
||||||
/** Returns the mask value at entry to the current function. */
|
/** Returns the current mask value */
|
||||||
llvm::Value *GetFunctionMask();
|
llvm::Value *GetMask();
|
||||||
|
|
||||||
/** Returns the mask value corresponding to "varying" control flow
|
|
||||||
within the current function. (i.e. this doesn't include the effect
|
|
||||||
of the mask at function entry. */
|
|
||||||
llvm::Value *GetInternalMask();
|
|
||||||
|
|
||||||
/** Returns the complete current mask value--i.e. the logical AND of
|
|
||||||
the function entry mask and the internal mask. */
|
|
||||||
llvm::Value *GetFullMask();
|
|
||||||
|
|
||||||
/** Provides the alloca'd pointer to memory to store the full function
|
|
||||||
mask. This is only used to wire up the __mask builtin variable. */
|
|
||||||
void SetMaskPointer(llvm::Value *p);
|
|
||||||
|
|
||||||
/** Provides the value of the mask at function entry */
|
/** Provides the value of the mask at function entry */
|
||||||
void SetFunctionMask(llvm::Value *val);
|
void SetEntryMask(llvm::Value *val);
|
||||||
|
|
||||||
/** Sets the internal mask to a new value */
|
/** Sets the mask to a new value */
|
||||||
void SetInternalMask(llvm::Value *val);
|
void SetMask(llvm::Value *val);
|
||||||
|
|
||||||
/** Sets the internal mask to (oldMask & val) */
|
/** Sets the mask to (oldMask & val) */
|
||||||
void SetInternalMaskAnd(llvm::Value *oldMask, llvm::Value *val);
|
void MaskAnd(llvm::Value *oldMask, llvm::Value *val);
|
||||||
|
|
||||||
/** Sets the internal mask to (oldMask & ~val) */
|
/** Sets the mask to (oldMask & ~val) */
|
||||||
void SetInternalMaskAndNot(llvm::Value *oldMask, llvm::Value *test);
|
void MaskAndNot(llvm::Value *oldMask, llvm::Value *test);
|
||||||
|
|
||||||
/** Emits a branch instruction to the basic block btrue if any of the
|
/** Emits a branch instruction to the basic block btrue if any of the
|
||||||
lanes of current mask are on and bfalse if none are on. */
|
lanes of current mask are on and bfalse if none are on. */
|
||||||
@@ -131,8 +115,9 @@ public:
|
|||||||
@{
|
@{
|
||||||
*/
|
*/
|
||||||
/** Notifies the FunctionEmitContext that we're starting emission of an
|
/** Notifies the FunctionEmitContext that we're starting emission of an
|
||||||
'if' statement with a uniform test. */
|
'if' statement with a uniform test. The value of the mask going
|
||||||
void StartUniformIf();
|
into the 'if' statement is provided in the oldMask parameter. */
|
||||||
|
void StartUniformIf(llvm::Value *oldMask);
|
||||||
|
|
||||||
/** Notifies the FunctionEmitContext that we're starting emission of an
|
/** Notifies the FunctionEmitContext that we're starting emission of an
|
||||||
'if' statement with a varying test. The value of the mask going
|
'if' statement with a varying test. The value of the mask going
|
||||||
@@ -147,9 +132,10 @@ public:
|
|||||||
for a loop. Basic blocks are provides for where 'break' and
|
for a loop. Basic blocks are provides for where 'break' and
|
||||||
'continue' statements should jump to (if all running lanes want to
|
'continue' statements should jump to (if all running lanes want to
|
||||||
break or continue), uniformControlFlow indicates whether the loop
|
break or continue), uniformControlFlow indicates whether the loop
|
||||||
condition is 'uniform'. */
|
condition is 'uniform', and oldMask provides the current mask going
|
||||||
|
into the loop. */
|
||||||
void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget,
|
void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget,
|
||||||
bool uniformControlFlow);
|
bool uniformControlFlow, llvm::Value *oldMask);
|
||||||
|
|
||||||
/** Informs FunctionEmitContext of the value of the mask at the start
|
/** Informs FunctionEmitContext of the value of the mask at the start
|
||||||
of a loop body. */
|
of a loop body. */
|
||||||
@@ -159,13 +145,6 @@ public:
|
|||||||
finished. */
|
finished. */
|
||||||
void EndLoop();
|
void EndLoop();
|
||||||
|
|
||||||
/** Indicates that code generation for a 'foreach' or 'foreach_tiled'
|
|
||||||
loop is about to start. The provided basic block pointer indicates
|
|
||||||
where control flow should go if a 'continue' statement is executed
|
|
||||||
in the loop. */
|
|
||||||
void StartForeach(llvm::BasicBlock *continueTarget);
|
|
||||||
void EndForeach();
|
|
||||||
|
|
||||||
/** Emit code for a 'break' statement in a loop. If doCoherenceCheck
|
/** Emit code for a 'break' statement in a loop. If doCoherenceCheck
|
||||||
is true, then if we're in a 'varying' loop, code will be emitted to
|
is true, then if we're in a 'varying' loop, code will be emitted to
|
||||||
see if all of the lanes want to break, in which case a jump to the
|
see if all of the lanes want to break, in which case a jump to the
|
||||||
@@ -190,8 +169,6 @@ public:
|
|||||||
flow */
|
flow */
|
||||||
int VaryingCFDepth() const;
|
int VaryingCFDepth() const;
|
||||||
|
|
||||||
bool InForeachLoop() const;
|
|
||||||
|
|
||||||
/** Called to generate code for 'return' statement; value is the
|
/** Called to generate code for 'return' statement; value is the
|
||||||
expression in the return statement (if non-NULL), and
|
expression in the return statement (if non-NULL), and
|
||||||
doCoherenceCheck indicates whether instructions should be generated
|
doCoherenceCheck indicates whether instructions should be generated
|
||||||
@@ -233,6 +210,9 @@ public:
|
|||||||
i32. */
|
i32. */
|
||||||
llvm::Value *I1VecToBoolVec(llvm::Value *b);
|
llvm::Value *I1VecToBoolVec(llvm::Value *b);
|
||||||
|
|
||||||
|
/** Returns the size of the given type. */
|
||||||
|
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *ty);
|
||||||
|
|
||||||
/** If the user has asked to compile the program with instrumentation,
|
/** If the user has asked to compile the program with instrumentation,
|
||||||
this inserts a callback to the user-supplied instrumentation
|
this inserts a callback to the user-supplied instrumentation
|
||||||
function at the current point in the code. */
|
function at the current point in the code. */
|
||||||
@@ -316,18 +296,12 @@ public:
|
|||||||
llvm::CmpInst::Predicate pred,
|
llvm::CmpInst::Predicate pred,
|
||||||
llvm::Value *v0, llvm::Value *v1, const char *name = NULL);
|
llvm::Value *v0, llvm::Value *v1, const char *name = NULL);
|
||||||
|
|
||||||
/** Given a scalar value, return a vector of the same type (or an
|
|
||||||
array, for pointer types). */
|
|
||||||
llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
|
|
||||||
|
|
||||||
llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||||
const char *name = NULL);
|
const char *name = NULL);
|
||||||
llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
|
|
||||||
llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||||
const char *name = NULL);
|
const char *name = NULL);
|
||||||
llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||||
const char *name = NULL);
|
const char *name = NULL);
|
||||||
|
|
||||||
llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||||
const char *name = NULL);
|
const char *name = NULL);
|
||||||
llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
|
llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
|
||||||
@@ -339,37 +313,26 @@ public:
|
|||||||
llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||||
const char *name = NULL);
|
const char *name = NULL);
|
||||||
|
|
||||||
/** These GEP methods are generalizations of the standard ones in LLVM;
|
/** This GEP method is a generalization of the standard one in LLVM; it
|
||||||
they support both uniform and varying basePtr values as well as
|
supports both uniform and varying basePtr values (an array of
|
||||||
uniform and varying index values (arrays of indices). Varying base
|
pointers) as well as uniform and varying index values (arrays of
|
||||||
pointers are expected to come in as vectors of i32/i64 (depending
|
indices). */
|
||||||
on the target), since LLVM doesn't currently support vectors of
|
|
||||||
pointers. The underlying type of the base pointer must be provided
|
|
||||||
via the ptrType parameter */
|
|
||||||
llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
|
|
||||||
const Type *ptrType, const char *name = NULL);
|
|
||||||
llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
|
llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
|
||||||
llvm::Value *index1, const Type *ptrType,
|
llvm::Value *index1, const char *name = NULL);
|
||||||
|
|
||||||
|
/** This is a convenience method to generate a GEP instruction with
|
||||||
|
indices with values with known constant values as the ispc program
|
||||||
|
is being compiled. */
|
||||||
|
llvm::Value *GetElementPtrInst(llvm::Value *basePtr, int v0, int v1,
|
||||||
const char *name = NULL);
|
const char *name = NULL);
|
||||||
|
|
||||||
/** This method returns a new pointer that represents offsetting the
|
/** Load from the memory location(s) given by lvalue. The lvalue may
|
||||||
given base pointer to point at the given element number of the
|
be varying, in which case this corresponds to a gather from the
|
||||||
structure type that the base pointer points to. (The provided
|
multiple memory locations given by the array of pointer values
|
||||||
pointer must be a pointer to a structure type. The ptrType gives
|
given by the lvalue. If the lvalue is not varying, then the type
|
||||||
the type of the pointer, though it may be NULL if the base pointer
|
parameter may be NULL. */
|
||||||
is uniform. */
|
llvm::Value *LoadInst(llvm::Value *lvalue, const Type *type,
|
||||||
llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
|
const char *name = NULL);
|
||||||
const Type *ptrType, const char *name = NULL);
|
|
||||||
|
|
||||||
/** Load from the memory location(s) given by lvalue, using the given
|
|
||||||
mask. The lvalue may be varying, in which case this corresponds to
|
|
||||||
a gather from the multiple memory locations given by the array of
|
|
||||||
pointer values given by the lvalue. If the lvalue is not varying,
|
|
||||||
then both the mask pointer and the type pointer may be NULL. */
|
|
||||||
llvm::Value *LoadInst(llvm::Value *ptr, llvm::Value *mask,
|
|
||||||
const Type *ptrType, const char *name = NULL);
|
|
||||||
|
|
||||||
llvm::Value *LoadInst(llvm::Value *ptr, const char *name = NULL);
|
|
||||||
|
|
||||||
/** Emits an alloca instruction to allocate stack storage for the given
|
/** Emits an alloca instruction to allocate stack storage for the given
|
||||||
type. If a non-zero alignment is specified, the object is also
|
type. If a non-zero alignment is specified, the object is also
|
||||||
@@ -377,20 +340,21 @@ public:
|
|||||||
instruction is added at the start of the function in the entry
|
instruction is added at the start of the function in the entry
|
||||||
basic block; if it should be added to the current basic block, then
|
basic block; if it should be added to the current basic block, then
|
||||||
the atEntryBlock parameter should be false. */
|
the atEntryBlock parameter should be false. */
|
||||||
llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType,
|
llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name = NULL,
|
||||||
const char *name = NULL, int align = 0,
|
int align = 0, bool atEntryBlock = true);
|
||||||
bool atEntryBlock = true);
|
|
||||||
|
|
||||||
/** Standard store instruction; for this variant, the lvalue must be a
|
/** Standard store instruction; for this variant, the lvalue must be a
|
||||||
single pointer, not a varying lvalue. */
|
single pointer, not a varying lvalue. */
|
||||||
void StoreInst(llvm::Value *value, llvm::Value *ptr);
|
void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||||
|
const char *name = NULL);
|
||||||
|
|
||||||
/** In this variant of StoreInst(), the lvalue may be varying. If so,
|
/** In this variant of StoreInst(), the lvalue may be varying. If so,
|
||||||
this corresponds to a scatter. Whether the lvalue is uniform of
|
this corresponds to a scatter. Whether the lvalue is uniform of
|
||||||
varying, the given storeMask is used to mask the stores so that
|
varying, the given storeMask is used to mask the stores so that
|
||||||
they only execute for the active program instances. */
|
they only execute for the active program instances. */
|
||||||
void StoreInst(llvm::Value *value, llvm::Value *ptr,
|
void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||||
llvm::Value *storeMask, const Type *ptrType);
|
llvm::Value *storeMask, const Type *rvalueType,
|
||||||
|
const char *name = NULL);
|
||||||
|
|
||||||
void BranchInst(llvm::BasicBlock *block);
|
void BranchInst(llvm::BasicBlock *block);
|
||||||
void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
|
void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
|
||||||
@@ -412,30 +376,24 @@ public:
|
|||||||
llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
|
llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
|
||||||
llvm::Value *val1, const char *name = NULL);
|
llvm::Value *val1, const char *name = NULL);
|
||||||
|
|
||||||
/** Emits IR to do a function call with the given arguments. If the
|
llvm::Instruction *CallInst(llvm::Function *func,
|
||||||
function type is a varying function pointer type, its full type
|
const std::vector<llvm::Value *> &args,
|
||||||
must be provided in funcType. funcType can be NULL if func is a
|
const char *name = NULL);
|
||||||
uniform function pointer. */
|
|
||||||
llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
|
|
||||||
const std::vector<llvm::Value *> &args,
|
|
||||||
const char *name = NULL);
|
|
||||||
|
|
||||||
/** This is a convenience method that issues a call instruction to a
|
/** This is a convenience method that issues a call instruction to a
|
||||||
function that takes just a single argument. */
|
function that takes just a single argument. */
|
||||||
llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
|
llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg,
|
||||||
llvm::Value *arg, const char *name = NULL);
|
const char *name = NULL);
|
||||||
|
|
||||||
/** This is a convenience method that issues a call instruction to a
|
/** This is a convenience method that issues a call instruction to a
|
||||||
function that takes two arguments. */
|
function that takes two arguments. */
|
||||||
llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
|
llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg0,
|
||||||
llvm::Value *arg0, llvm::Value *arg1,
|
llvm::Value *arg1, const char *name = NULL);
|
||||||
const char *name = NULL);
|
|
||||||
|
|
||||||
/** Launch an asynchronous task to run the given function, passing it
|
/** Launch an asynchronous task to run the given function, passing it
|
||||||
he given argument values. */
|
he given argument values. */
|
||||||
llvm::Value *LaunchInst(llvm::Value *callee,
|
llvm::Instruction *LaunchInst(llvm::Function *callee,
|
||||||
std::vector<llvm::Value *> &argVals,
|
std::vector<llvm::Value *> &argVals,
|
||||||
llvm::Value *launchCount);
|
llvm::Value *launchCount);
|
||||||
|
|
||||||
void SyncInst();
|
void SyncInst();
|
||||||
|
|
||||||
@@ -443,9 +401,6 @@ public:
|
|||||||
/** @} */
|
/** @} */
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/** Pointer to the Function for which we're currently generating code. */
|
|
||||||
Function *function;
|
|
||||||
|
|
||||||
/** The basic block into which we add any alloca instructions that need
|
/** The basic block into which we add any alloca instructions that need
|
||||||
to go at the very start of the function. */
|
to go at the very start of the function. */
|
||||||
llvm::BasicBlock *allocaBlock;
|
llvm::BasicBlock *allocaBlock;
|
||||||
@@ -455,16 +410,8 @@ private:
|
|||||||
llvm::BasicBlock *bblock;
|
llvm::BasicBlock *bblock;
|
||||||
|
|
||||||
/** Pointer to stack-allocated memory that stores the current value of
|
/** Pointer to stack-allocated memory that stores the current value of
|
||||||
the full program mask. */
|
the program mask. */
|
||||||
llvm::Value *fullMaskPointer;
|
llvm::Value *maskPtr;
|
||||||
|
|
||||||
/** Pointer to stack-allocated memory that stores the current value of
|
|
||||||
the program mask representing varying control flow within the
|
|
||||||
function. */
|
|
||||||
llvm::Value *internalMaskPointer;
|
|
||||||
|
|
||||||
/** Value of the program mask when the function starts execution. */
|
|
||||||
llvm::Value *functionMaskValue;
|
|
||||||
|
|
||||||
/** Current source file position; if debugging information is being
|
/** Current source file position; if debugging information is being
|
||||||
generated, this position is used to set file/line information for
|
generated, this position is used to set file/line information for
|
||||||
@@ -475,6 +422,12 @@ private:
|
|||||||
for error messages and debugging symbols. */
|
for error messages and debugging symbols. */
|
||||||
SourcePos funcStartPos;
|
SourcePos funcStartPos;
|
||||||
|
|
||||||
|
/** Type of result that the current function returns. */
|
||||||
|
const Type *returnType;
|
||||||
|
|
||||||
|
/** Value of the program mask when the function starts execution. */
|
||||||
|
llvm::Value *entryMask;
|
||||||
|
|
||||||
/** If currently in a loop body, the value of the mask at the start of
|
/** If currently in a loop body, the value of the mask at the start of
|
||||||
the loop. */
|
the loop. */
|
||||||
llvm::Value *loopMask;
|
llvm::Value *loopMask;
|
||||||
@@ -538,23 +491,19 @@ private:
|
|||||||
llvm::Value *launchGroupHandlePtr;
|
llvm::Value *launchGroupHandlePtr;
|
||||||
|
|
||||||
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
|
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
|
||||||
static void addGSMetadata(llvm::Value *inst, SourcePos pos);
|
static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
|
||||||
bool ifsInLoopAllUniform() const;
|
bool ifsInLoopAllUniform() const;
|
||||||
void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
|
void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
|
||||||
llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
|
llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
|
||||||
|
|
||||||
llvm::Value *applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
|
|
||||||
const Type *ptrType);
|
|
||||||
|
|
||||||
void restoreMaskGivenReturns(llvm::Value *oldMask);
|
void restoreMaskGivenReturns(llvm::Value *oldMask);
|
||||||
|
|
||||||
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
|
void scatter(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||||
llvm::Value *mask);
|
llvm::Value *maskPtr, const Type *rvalueType);
|
||||||
void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
|
llvm::Value *gather(llvm::Value *lvalue, const Type *type,
|
||||||
llvm::Value *mask);
|
|
||||||
llvm::Value *gather(llvm::Value *ptr, const Type *ptrType, llvm::Value *mask,
|
|
||||||
const char *name);
|
const char *name);
|
||||||
llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
|
void maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||||
|
const Type *rvalueType, llvm::Value *maskPtr);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // ISPC_CTX_H
|
#endif // ISPC_CTX_H
|
||||||
|
|||||||
583
decl.cpp
583
decl.cpp
@@ -38,59 +38,10 @@
|
|||||||
|
|
||||||
#include "decl.h"
|
#include "decl.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "module.h"
|
|
||||||
#include "sym.h"
|
#include "sym.h"
|
||||||
#include "type.h"
|
#include "type.h"
|
||||||
#include "stmt.h"
|
|
||||||
#include "expr.h"
|
#include "expr.h"
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <set>
|
|
||||||
|
|
||||||
/** Given a Type and a set of type qualifiers, apply the type qualifiers to
|
|
||||||
the type, returning the type that is the result.
|
|
||||||
*/
|
|
||||||
static const Type *
|
|
||||||
lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
|
|
||||||
if (type == NULL)
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
|
|
||||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
|
|
||||||
Error(pos, "Illegal to apply both \"signed\" and \"unsigned\" "
|
|
||||||
"qualifiers.");
|
|
||||||
|
|
||||||
const Type *unsignedType = type->GetAsUnsignedType();
|
|
||||||
if (unsignedType != NULL)
|
|
||||||
type = unsignedType;
|
|
||||||
else
|
|
||||||
Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
|
|
||||||
type->GetString().c_str());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false)
|
|
||||||
Error(pos, "\"signed\" qualifier is illegal with non-integer type "
|
|
||||||
"\"%s\".", type->GetString().c_str());
|
|
||||||
|
|
||||||
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
|
|
||||||
type = type->GetAsConstType();
|
|
||||||
|
|
||||||
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
|
|
||||||
type = type->GetAsUniformType();
|
|
||||||
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
|
|
||||||
type = type->GetAsVaryingType();
|
|
||||||
else {
|
|
||||||
// otherwise, structs are uniform by default and everything
|
|
||||||
// else is varying by default
|
|
||||||
if (dynamic_cast<const StructType *>(type->GetBaseType()) != NULL)
|
|
||||||
type = type->GetAsUniformType();
|
|
||||||
else
|
|
||||||
type = type->GetAsVaryingType();
|
|
||||||
}
|
|
||||||
|
|
||||||
return type;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// DeclSpecs
|
// DeclSpecs
|
||||||
@@ -98,57 +49,29 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
|
|||||||
DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
|
DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
|
||||||
baseType = t;
|
baseType = t;
|
||||||
storageClass = sc;
|
storageClass = sc;
|
||||||
typeQualifiers = tq;
|
typeQualifier = tq;
|
||||||
soaWidth = 0;
|
soaWidth = 0;
|
||||||
vectorSize = 0;
|
vectorSize = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
const Type *
|
|
||||||
DeclSpecs::GetBaseType(SourcePos pos) const {
|
|
||||||
const Type *bt = baseType;
|
|
||||||
if (vectorSize > 0) {
|
|
||||||
const AtomicType *atomicType = dynamic_cast<const AtomicType *>(bt);
|
|
||||||
if (atomicType == NULL) {
|
|
||||||
Error(pos, "Only atomic types (int, float, ...) are legal for vector "
|
|
||||||
"types.");
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
bt = new VectorType(atomicType, vectorSize);
|
|
||||||
}
|
|
||||||
|
|
||||||
return lApplyTypeQualifiers(typeQualifiers, bt, pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static const char *
|
|
||||||
lGetStorageClassName(StorageClass storageClass) {
|
|
||||||
switch (storageClass) {
|
|
||||||
case SC_NONE: return "";
|
|
||||||
case SC_EXTERN: return "extern";
|
|
||||||
case SC_EXTERN_C: return "extern \"C\"";
|
|
||||||
case SC_EXPORT: return "export";
|
|
||||||
case SC_STATIC: return "static";
|
|
||||||
case SC_TYPEDEF: return "typedef";
|
|
||||||
default: FATAL("Unhandled storage class in lGetStorageClassName");
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
DeclSpecs::Print() const {
|
DeclSpecs::Print() const {
|
||||||
printf("%s ", lGetStorageClassName(storageClass));
|
if (storageClass == SC_EXTERN) printf("extern ");
|
||||||
|
if (storageClass == SC_EXTERN_C) printf("extern \"C\" ");
|
||||||
|
if (storageClass == SC_EXPORT) printf("export ");
|
||||||
|
if (storageClass == SC_STATIC) printf("static ");
|
||||||
|
if (storageClass == SC_TYPEDEF) printf("typedef ");
|
||||||
|
|
||||||
if (soaWidth > 0) printf("soa<%d> ", soaWidth);
|
if (soaWidth > 0) printf("soa<%d> ", soaWidth);
|
||||||
|
|
||||||
if (typeQualifiers & TYPEQUAL_INLINE) printf("inline ");
|
if (typeQualifier & TYPEQUAL_INLINE) printf("inline ");
|
||||||
if (typeQualifiers & TYPEQUAL_CONST) printf("const ");
|
if (typeQualifier & TYPEQUAL_CONST) printf("const ");
|
||||||
if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
|
if (typeQualifier & TYPEQUAL_UNIFORM) printf("uniform ");
|
||||||
if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
|
if (typeQualifier & TYPEQUAL_VARYING) printf("varying ");
|
||||||
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
|
if (typeQualifier & TYPEQUAL_TASK) printf("task ");
|
||||||
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
|
if (typeQualifier & TYPEQUAL_REFERENCE) printf("reference ");
|
||||||
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
|
if (typeQualifier & TYPEQUAL_UNSIGNED) printf("unsigned ");
|
||||||
|
|
||||||
printf("%s", baseType->GetString().c_str());
|
printf("%s", baseType->GetString().c_str());
|
||||||
|
|
||||||
@@ -159,46 +82,34 @@ DeclSpecs::Print() const {
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Declarator
|
// Declarator
|
||||||
|
|
||||||
Declarator::Declarator(DeclaratorKind dk, SourcePos p)
|
Declarator::Declarator(Symbol *s, SourcePos p)
|
||||||
: pos(p), kind(dk) {
|
: pos(p) {
|
||||||
child = NULL;
|
sym = s;
|
||||||
typeQualifiers = 0;
|
functionArgs = NULL;
|
||||||
arraySize = -1;
|
isFunction = false;
|
||||||
sym = NULL;
|
|
||||||
initExpr = NULL;
|
initExpr = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
|
Declarator::AddArrayDimension(int size) {
|
||||||
const Type *t = GetType(ds);
|
assert(size > 0 || size == -1); // -1 -> unsized
|
||||||
Symbol *sym = GetSymbol();
|
arraySize.push_back(size);
|
||||||
if (sym != NULL) {
|
|
||||||
sym->type = t;
|
|
||||||
sym->storageClass = ds->storageClass;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
Symbol *
|
void
|
||||||
Declarator::GetSymbol() const {
|
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
|
||||||
// The symbol lives at the last child in the chain, so walk down there
|
sym->type = GetType(ds);
|
||||||
// and return the one there.
|
|
||||||
const Declarator *d = this;
|
if (ds->storageClass == SC_STATIC)
|
||||||
while (d->child != NULL)
|
sym->isStatic = true;
|
||||||
d = d->child;
|
|
||||||
return d->sym;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
Declarator::Print() const {
|
Declarator::Print() const {
|
||||||
Symbol *sym = GetSymbol();
|
printf("%s", sym->name.c_str());
|
||||||
if (sym != NULL)
|
|
||||||
printf("%s", sym->name.c_str());
|
|
||||||
else
|
|
||||||
printf("(null symbol)");
|
|
||||||
|
|
||||||
if (initExpr != NULL) {
|
if (initExpr != NULL) {
|
||||||
printf(" = (");
|
printf(" = (");
|
||||||
initExpr->Print();
|
initExpr->Print();
|
||||||
@@ -208,305 +119,188 @@ Declarator::Print() const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
Symbol *
|
static const Type *
|
||||||
Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
|
lGetType(const Declarator *decl, DeclSpecs *ds,
|
||||||
const FunctionType *type =
|
std::vector<int>::const_iterator arrayIter) {
|
||||||
dynamic_cast<const FunctionType *>(GetType(ds));
|
if (arrayIter == decl->arraySize.end()) {
|
||||||
if (type == NULL)
|
// If we don't have an array (or have processed all of the array
|
||||||
return NULL;
|
// dimensions in previous recursive calls), we can go ahead and
|
||||||
|
// figure out the final non-array type we have here.
|
||||||
Symbol *declSym = GetSymbol();
|
const Type *type = ds->baseType;
|
||||||
assert(declSym != NULL);
|
if (type == NULL) {
|
||||||
|
Error(decl->pos, "Type not provided in variable declaration for variable \"%s\".",
|
||||||
// Get the symbol for the function from the symbol table. (It should
|
decl->sym->name.c_str());
|
||||||
// already have been added to the symbol table by AddGlobal() by the
|
|
||||||
// time we get here.)
|
|
||||||
Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
|
|
||||||
if (funSym != NULL)
|
|
||||||
// May be NULL due to error earlier in compilation
|
|
||||||
funSym->pos = pos;
|
|
||||||
|
|
||||||
// Walk down to the declarator for the function. (We have to get past
|
|
||||||
// the stuff that specifies the function's return type before we get to
|
|
||||||
// the function's declarator.)
|
|
||||||
Declarator *d = this;
|
|
||||||
while (d != NULL && d->kind != DK_FUNCTION)
|
|
||||||
d = d->child;
|
|
||||||
assert(d != NULL);
|
|
||||||
|
|
||||||
for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
|
|
||||||
Declaration *pdecl = d->functionParams[i];
|
|
||||||
assert(pdecl->declarators.size() == 1);
|
|
||||||
funArgs->push_back(pdecl->declarators[0]->GetSymbol());
|
|
||||||
}
|
|
||||||
|
|
||||||
return funSym;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
const Type *
|
|
||||||
Declarator::GetType(const Type *base, DeclSpecs *ds) const {
|
|
||||||
bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
|
|
||||||
bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
|
|
||||||
bool isTask = ((typeQualifiers & TYPEQUAL_TASK) != 0);
|
|
||||||
bool isConst = ((typeQualifiers & TYPEQUAL_CONST) != 0);
|
|
||||||
|
|
||||||
if (hasUniformQual && hasVaryingQual) {
|
|
||||||
Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
if (kind != DK_FUNCTION && isTask)
|
|
||||||
Error(pos, "\"task\" qualifier illegal in variable declaration.");
|
|
||||||
|
|
||||||
const Type *type = base;
|
|
||||||
switch (kind) {
|
|
||||||
case DK_BASE:
|
|
||||||
// All of the type qualifiers should be in the DeclSpecs for the
|
|
||||||
// base declarator
|
|
||||||
assert(typeQualifiers == 0);
|
|
||||||
assert(child == NULL);
|
|
||||||
return type;
|
|
||||||
|
|
||||||
case DK_POINTER:
|
|
||||||
type = new PointerType(type, hasUniformQual, isConst);
|
|
||||||
if (child != NULL)
|
|
||||||
return child->GetType(type, ds);
|
|
||||||
else
|
|
||||||
return type;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case DK_REFERENCE:
|
|
||||||
if (hasUniformQual)
|
|
||||||
Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
|
|
||||||
if (hasVaryingQual)
|
|
||||||
Error(pos, "\"varying\" qualifier is illegal to apply to references.");
|
|
||||||
if (isConst)
|
|
||||||
Error(pos, "\"const\" qualifier is to illegal apply to references.");
|
|
||||||
|
|
||||||
// The parser should disallow this already, but double check.
|
|
||||||
if (dynamic_cast<const ReferenceType *>(type) != NULL) {
|
|
||||||
Error(pos, "References to references are illegal.");
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
type = new ReferenceType(type);
|
// Account for 'unsigned' and 'const' qualifiers in the type
|
||||||
if (child != NULL)
|
if ((ds->typeQualifier & TYPEQUAL_UNSIGNED) != 0) {
|
||||||
return child->GetType(type, ds);
|
const Type *unsignedType = type->GetAsUnsignedType();
|
||||||
else
|
if (unsignedType != NULL)
|
||||||
return type;
|
type = unsignedType;
|
||||||
break;
|
else
|
||||||
|
Error(decl->pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
|
||||||
|
type->GetString().c_str());
|
||||||
|
}
|
||||||
|
if ((ds->typeQualifier & TYPEQUAL_CONST) != 0)
|
||||||
|
type = type->GetAsConstType();
|
||||||
|
|
||||||
case DK_ARRAY:
|
if (ds->vectorSize > 0) {
|
||||||
type = new ArrayType(type, arraySize);
|
const AtomicType *atomicType = dynamic_cast<const AtomicType *>(type);
|
||||||
if (child)
|
if (atomicType == NULL) {
|
||||||
return child->GetType(type, ds);
|
Error(decl->pos, "Only atomic types (int, float, ...) are legal for vector "
|
||||||
else
|
"types.");
|
||||||
return type;
|
return NULL;
|
||||||
break;
|
|
||||||
|
|
||||||
case DK_FUNCTION: {
|
|
||||||
std::vector<const Type *> args;
|
|
||||||
std::vector<std::string> argNames;
|
|
||||||
std::vector<ConstExpr *> argDefaults;
|
|
||||||
std::vector<SourcePos> argPos;
|
|
||||||
|
|
||||||
// Loop over the function arguments and store the names, types,
|
|
||||||
// default values (if any), and source file positions each one in
|
|
||||||
// the corresponding vector.
|
|
||||||
for (unsigned int i = 0; i < functionParams.size(); ++i) {
|
|
||||||
Declaration *d = functionParams[i];
|
|
||||||
|
|
||||||
char buf[32];
|
|
||||||
Symbol *sym;
|
|
||||||
if (d->declarators.size() == 0) {
|
|
||||||
// function declaration like foo(float), w/o a name for
|
|
||||||
// the parameter
|
|
||||||
sprintf(buf, "__anon_parameter_%d", i);
|
|
||||||
sym = new Symbol(buf, pos);
|
|
||||||
sym->type = d->declSpecs->GetBaseType(pos);
|
|
||||||
}
|
}
|
||||||
else {
|
type = new VectorType(atomicType, ds->vectorSize);
|
||||||
sym = d->declarators[0]->GetSymbol();
|
|
||||||
if (sym == NULL) {
|
|
||||||
// Handle more complex anonymous declarations like
|
|
||||||
// float (float **).
|
|
||||||
sprintf(buf, "__anon_parameter_%d", i);
|
|
||||||
sym = new Symbol(buf, d->declarators[0]->pos);
|
|
||||||
sym->type = d->declarators[0]->GetType(d->declSpecs);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (d->declSpecs->storageClass != SC_NONE)
|
|
||||||
Error(sym->pos, "Storage class \"%s\" is illegal in "
|
|
||||||
"function parameter declaration for parameter \"%s\".",
|
|
||||||
lGetStorageClassName(d->declSpecs->storageClass),
|
|
||||||
sym->name.c_str());
|
|
||||||
|
|
||||||
const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
|
|
||||||
if (at != NULL) {
|
|
||||||
// As in C, arrays are passed to functions as pointers to
|
|
||||||
// their element type. We'll just immediately make this
|
|
||||||
// change now. (One shortcoming of losing the fact that
|
|
||||||
// the it was originally an array is that any warnings or
|
|
||||||
// errors later issued that print the function type will
|
|
||||||
// report this differently than it was originally declared
|
|
||||||
// in the function, but it's not clear that this is a
|
|
||||||
// significant problem.)
|
|
||||||
sym->type = PointerType::GetUniform(at->GetElementType());
|
|
||||||
|
|
||||||
// Make sure there are no unsized arrays (other than the
|
|
||||||
// first dimension) in function parameter lists.
|
|
||||||
at = dynamic_cast<const ArrayType *>(at->GetElementType());
|
|
||||||
while (at != NULL) {
|
|
||||||
if (at->GetElementCount() == 0)
|
|
||||||
Error(sym->pos, "Arrays with unsized dimensions in "
|
|
||||||
"dimensions after the first one are illegal in "
|
|
||||||
"function parameter lists.");
|
|
||||||
at = dynamic_cast<const ArrayType *>(at->GetElementType());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
args.push_back(sym->type);
|
|
||||||
argNames.push_back(sym->name);
|
|
||||||
argPos.push_back(sym->pos);
|
|
||||||
|
|
||||||
ConstExpr *init = NULL;
|
|
||||||
if (d->declarators.size()) {
|
|
||||||
// Try to find an initializer expression; if there is one,
|
|
||||||
// it lives down to the base declarator.
|
|
||||||
Declarator *decl = d->declarators[0];
|
|
||||||
while (decl->child != NULL) {
|
|
||||||
assert(decl->initExpr == NULL);
|
|
||||||
decl = decl->child;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (decl->initExpr != NULL &&
|
|
||||||
(decl->initExpr = decl->initExpr->TypeCheck()) != NULL &&
|
|
||||||
(decl->initExpr = decl->initExpr->Optimize()) != NULL &&
|
|
||||||
(init = dynamic_cast<ConstExpr *>(decl->initExpr)) == NULL) {
|
|
||||||
Error(decl->initExpr->pos, "Default value for parameter "
|
|
||||||
"\"%s\" must be a compile-time constant.",
|
|
||||||
sym->name.c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
argDefaults.push_back(init);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const Type *returnType = type;
|
// if uniform/varying is specified explicitly, then go with that
|
||||||
if (returnType == NULL) {
|
if ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0)
|
||||||
Error(pos, "No return type provided in function declaration.");
|
return type->GetAsUniformType();
|
||||||
return NULL;
|
else if ((ds->typeQualifier & TYPEQUAL_VARYING) != 0)
|
||||||
|
return type->GetAsVaryingType();
|
||||||
|
else {
|
||||||
|
// otherwise, structs are uniform by default and everything
|
||||||
|
// else is varying by default
|
||||||
|
if (dynamic_cast<const StructType *>(type) != NULL)
|
||||||
|
return type->GetAsUniformType();
|
||||||
|
else
|
||||||
|
return type->GetAsVaryingType();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isExported = ds && (ds->storageClass == SC_EXPORT);
|
|
||||||
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
|
|
||||||
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
|
|
||||||
|
|
||||||
if (isExported && isTask) {
|
|
||||||
Error(pos, "Function can't have both \"task\" and \"export\" "
|
|
||||||
"qualifiers");
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
if (isExternC && isTask) {
|
|
||||||
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
|
|
||||||
"qualifiers");
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
if (isExternC && isExported) {
|
|
||||||
Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
|
|
||||||
"qualifiers");
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
Type *functionType =
|
|
||||||
new FunctionType(returnType, args, pos, argNames, argDefaults,
|
|
||||||
argPos, isTask, isExported, isExternC);
|
|
||||||
return child->GetType(functionType, ds);
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
FATAL("Unexpected decl kind");
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
// Peel off one dimension of the array
|
||||||
|
int arraySize = *arrayIter;
|
||||||
|
++arrayIter;
|
||||||
|
|
||||||
#if 0
|
// Get the type, not including the arraySize dimension peeled off
|
||||||
|
// above.
|
||||||
|
const Type *childType = lGetType(decl, ds, arrayIter);
|
||||||
|
|
||||||
|
int soaWidth = ds->soaWidth;
|
||||||
|
if (soaWidth == 0)
|
||||||
|
// If there's no "soa<n>" stuff going on, just return a regular
|
||||||
|
// array with the appropriate size
|
||||||
|
return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
|
||||||
|
else {
|
||||||
// Make sure we actually have an array of structs ..
|
// Make sure we actually have an array of structs ..
|
||||||
const StructType *childStructType =
|
const StructType *childStructType =
|
||||||
dynamic_cast<const StructType *>(childType);
|
dynamic_cast<const StructType *>(childType);
|
||||||
if (childStructType == NULL) {
|
if (childStructType == NULL) {
|
||||||
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
|
Error(decl->pos, "Illegal to provide soa<%d> qualifier with non-struct "
|
||||||
"type \"%s\".", soaWidth, childType->GetString().c_str());
|
"type \"%s\".", soaWidth, childType->GetString().c_str());
|
||||||
return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
|
return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
|
||||||
}
|
}
|
||||||
else if ((soaWidth & (soaWidth - 1)) != 0) {
|
else if ((soaWidth & (soaWidth - 1)) != 0) {
|
||||||
Error(pos, "soa<%d> width illegal. Value must be power of two.",
|
Error(decl->pos, "soa<%d> width illegal. Value must be power of two.",
|
||||||
soaWidth);
|
soaWidth);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
else if (arraySize != -1 && (arraySize % soaWidth) != 0) {
|
else if (arraySize != -1 && (arraySize % soaWidth) != 0) {
|
||||||
Error(pos, "soa<%d> width must evenly divide array size %d.",
|
Error(decl->pos, "soa<%d> width must evenly divide array size %d.",
|
||||||
soaWidth, arraySize);
|
soaWidth, arraySize);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
return new SOAArrayType(childStructType, arraySize == -1 ? 0 : arraySize,
|
return new SOAArrayType(childStructType, arraySize == -1 ? 0 : arraySize,
|
||||||
soaWidth);
|
soaWidth);
|
||||||
#endif
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
const Type *
|
const Type *
|
||||||
Declarator::GetType(DeclSpecs *ds) const {
|
Declarator::GetType(DeclSpecs *ds) const {
|
||||||
const Type *baseType = ds->GetBaseType(pos);
|
bool hasUniformQual = ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0);
|
||||||
const Type *type = GetType(baseType, ds);
|
bool hasVaryingQual = ((ds->typeQualifier & TYPEQUAL_VARYING) != 0);
|
||||||
return type;
|
bool isTask = ((ds->typeQualifier & TYPEQUAL_TASK) != 0);
|
||||||
}
|
bool isReference = ((ds->typeQualifier & TYPEQUAL_REFERENCE) != 0);
|
||||||
|
|
||||||
|
if (hasUniformQual && hasVaryingQual) {
|
||||||
|
Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isFunction) {
|
||||||
|
std::vector<const Type *> args;
|
||||||
|
std::vector<std::string> argNames;
|
||||||
|
if (functionArgs) {
|
||||||
|
// Loop over the function arguments and get names and types for
|
||||||
|
// each one in the args and argNames arrays
|
||||||
|
for (unsigned int i = 0; i < functionArgs->size(); ++i) {
|
||||||
|
Declaration *d = (*functionArgs)[i];
|
||||||
|
Symbol *sym;
|
||||||
|
if (d->declarators.size() == 0) {
|
||||||
|
// function declaration like foo(float), w/o a name for
|
||||||
|
// the parameter
|
||||||
|
char buf[32];
|
||||||
|
sprintf(buf, "__anon_parameter_%d", i);
|
||||||
|
sym = new Symbol(buf, pos);
|
||||||
|
Declarator *declarator = new Declarator(sym, sym->pos);
|
||||||
|
sym->type = declarator->GetType(d->declSpecs);
|
||||||
|
d->declarators.push_back(declarator);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
assert(d->declarators.size() == 1);
|
||||||
|
sym = d->declarators[0]->sym;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Arrays are passed by reference, so convert array
|
||||||
|
// parameters to be references here.
|
||||||
|
if (dynamic_cast<const ArrayType *>(sym->type) != NULL)
|
||||||
|
sym->type = new ReferenceType(sym->type, sym->type->IsConstType());
|
||||||
|
|
||||||
|
args.push_back(sym->type);
|
||||||
|
argNames.push_back(sym->name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ds->baseType == NULL) {
|
||||||
|
Warning(pos, "No return type provided in declaration of function \"%s\". "
|
||||||
|
"Treating as \"void\".", sym->name.c_str());
|
||||||
|
ds->baseType = AtomicType::Void;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isReference) {
|
||||||
|
Error(pos, "Function return types can't be reference types.");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
const Type *returnType = lGetType(this, ds, arraySize.begin());
|
||||||
|
if (returnType == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
bool isExported = (ds->storageClass == SC_EXPORT);
|
||||||
|
bool isExternC = (ds->storageClass == SC_EXTERN_C);
|
||||||
|
return new FunctionType(returnType, args, pos, &argNames, isTask,
|
||||||
|
isExported, isExternC);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (isTask)
|
||||||
|
Error(pos, "\"task\" qualifier illegal in variable declaration \"%s\".",
|
||||||
|
sym->name.c_str());
|
||||||
|
|
||||||
|
const Type *type = lGetType(this, ds, arraySize.begin());
|
||||||
|
|
||||||
|
if (type != NULL && isReference) {
|
||||||
|
bool hasConstQual = ((ds->typeQualifier & TYPEQUAL_CONST) != 0);
|
||||||
|
type = new ReferenceType(type, hasConstQual);
|
||||||
|
}
|
||||||
|
|
||||||
|
return type;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Declaration
|
// Declaration
|
||||||
|
|
||||||
Declaration::Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist) {
|
void
|
||||||
declSpecs = ds;
|
Declaration::AddSymbols(SymbolTable *st) const {
|
||||||
if (dlist != NULL)
|
|
||||||
declarators = *dlist;
|
|
||||||
for (unsigned int i = 0; i < declarators.size(); ++i)
|
|
||||||
if (declarators[i] != NULL)
|
|
||||||
declarators[i]->InitFromDeclSpecs(declSpecs);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
Declaration::Declaration(DeclSpecs *ds, Declarator *d) {
|
|
||||||
declSpecs = ds;
|
|
||||||
if (d != NULL) {
|
|
||||||
d->InitFromDeclSpecs(ds);
|
|
||||||
declarators.push_back(d);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
std::vector<VariableDeclaration>
|
|
||||||
Declaration::GetVariableDeclarations() const {
|
|
||||||
assert(declSpecs->storageClass != SC_TYPEDEF);
|
assert(declSpecs->storageClass != SC_TYPEDEF);
|
||||||
std::vector<VariableDeclaration> vars;
|
|
||||||
|
|
||||||
for (unsigned int i = 0; i < declarators.size(); ++i) {
|
for (unsigned int i = 0; i < declarators.size(); ++i)
|
||||||
if (declarators[i] == NULL)
|
if (declarators[i])
|
||||||
continue;
|
st->AddVariable(declarators[i]->sym);
|
||||||
Declarator *decl = declarators[i];
|
|
||||||
if (decl == NULL)
|
|
||||||
// Ignore earlier errors
|
|
||||||
continue;
|
|
||||||
|
|
||||||
Symbol *sym = decl->GetSymbol();
|
|
||||||
if (dynamic_cast<const FunctionType *>(sym->type) != NULL) {
|
|
||||||
// function declaration
|
|
||||||
m->symbolTable->AddFunction(sym);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
m->symbolTable->AddVariable(sym);
|
|
||||||
vars.push_back(VariableDeclaration(sym, decl->initExpr));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return vars;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -528,44 +322,29 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
|
|||||||
std::vector<const Type *> *elementTypes,
|
std::vector<const Type *> *elementTypes,
|
||||||
std::vector<std::string> *elementNames,
|
std::vector<std::string> *elementNames,
|
||||||
std::vector<SourcePos> *elementPositions) {
|
std::vector<SourcePos> *elementPositions) {
|
||||||
std::set<std::string> seenNames;
|
|
||||||
for (unsigned int i = 0; i < sd.size(); ++i) {
|
for (unsigned int i = 0; i < sd.size(); ++i) {
|
||||||
const Type *type = sd[i]->type;
|
const Type *type = sd[i]->type;
|
||||||
if (type == NULL)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
// FIXME: making this fake little DeclSpecs here is really
|
// FIXME: making this fake little DeclSpecs here is really
|
||||||
// disgusting
|
// disgusting
|
||||||
DeclSpecs ds(type);
|
DeclSpecs ds(type);
|
||||||
if (type->IsUniformType())
|
if (type->IsUniformType())
|
||||||
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
|
ds.typeQualifier |= TYPEQUAL_UNIFORM;
|
||||||
else
|
else
|
||||||
ds.typeQualifiers |= TYPEQUAL_VARYING;
|
ds.typeQualifier |= TYPEQUAL_VARYING;
|
||||||
|
|
||||||
for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
|
for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
|
||||||
Declarator *d = (*sd[i]->declarators)[j];
|
Declarator *d = (*sd[i]->declarators)[j];
|
||||||
d->InitFromDeclSpecs(&ds);
|
d->InitFromDeclSpecs(&ds);
|
||||||
|
|
||||||
Symbol *sym = d->GetSymbol();
|
// if it's an unsized array, make it a reference to an unsized
|
||||||
|
// array, so the caller can pass a pointer...
|
||||||
|
const ArrayType *at = dynamic_cast<const ArrayType *>(d->sym->type);
|
||||||
|
if (at && at->GetElementCount() == 0)
|
||||||
|
d->sym->type = new ReferenceType(d->sym->type, type->IsConstType());
|
||||||
|
|
||||||
const ArrayType *arrayType =
|
elementTypes->push_back(d->sym->type);
|
||||||
dynamic_cast<const ArrayType *>(sym->type);
|
elementNames->push_back(d->sym->name);
|
||||||
if (arrayType != NULL && arrayType->GetElementCount() == 0) {
|
elementPositions->push_back(d->sym->pos);
|
||||||
Error(d->pos, "Unsized arrays aren't allowed in struct "
|
|
||||||
"definitions.");
|
|
||||||
elementTypes->push_back(NULL);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
elementTypes->push_back(sym->type);
|
|
||||||
|
|
||||||
if (seenNames.find(sym->name) != seenNames.end())
|
|
||||||
Error(d->pos, "Struct member \"%s\" has same name as a "
|
|
||||||
"previously-declared member.", sym->name.c_str());
|
|
||||||
else
|
|
||||||
seenNames.insert(sym->name);
|
|
||||||
|
|
||||||
elementNames->push_back(sym->name);
|
|
||||||
elementPositions->push_back(sym->pos);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
100
decl.h
100
decl.h
@@ -56,11 +56,6 @@
|
|||||||
|
|
||||||
#include "ispc.h"
|
#include "ispc.h"
|
||||||
|
|
||||||
struct VariableDeclaration;
|
|
||||||
|
|
||||||
class Declaration;
|
|
||||||
class Declarator;
|
|
||||||
|
|
||||||
enum StorageClass {
|
enum StorageClass {
|
||||||
SC_NONE,
|
SC_NONE,
|
||||||
SC_EXTERN,
|
SC_EXTERN,
|
||||||
@@ -79,7 +74,7 @@ enum StorageClass {
|
|||||||
#define TYPEQUAL_UNIFORM (1<<1)
|
#define TYPEQUAL_UNIFORM (1<<1)
|
||||||
#define TYPEQUAL_VARYING (1<<2)
|
#define TYPEQUAL_VARYING (1<<2)
|
||||||
#define TYPEQUAL_TASK (1<<3)
|
#define TYPEQUAL_TASK (1<<3)
|
||||||
#define TYPEQUAL_SIGNED (1<<4)
|
#define TYPEQUAL_REFERENCE (1<<4)
|
||||||
#define TYPEQUAL_UNSIGNED (1<<5)
|
#define TYPEQUAL_UNSIGNED (1<<5)
|
||||||
#define TYPEQUAL_INLINE (1<<6)
|
#define TYPEQUAL_INLINE (1<<6)
|
||||||
|
|
||||||
@@ -97,17 +92,15 @@ public:
|
|||||||
StorageClass storageClass;
|
StorageClass storageClass;
|
||||||
|
|
||||||
/** Zero or more of the TYPEQUAL_* values, ORed together. */
|
/** Zero or more of the TYPEQUAL_* values, ORed together. */
|
||||||
int typeQualifiers;
|
int typeQualifier;
|
||||||
|
|
||||||
/** The basic type provided in the declaration; this should be an
|
/** The basic type provided in the declaration; this should be an
|
||||||
AtomicType, EnumType, StructType, or VectorType; other types (like
|
AtomicType, a StructType, or a VectorType; other types (like
|
||||||
ArrayTypes) will end up being created if a particular declaration
|
ArrayTypes) will end up being created if a particular declaration
|
||||||
has an array size, etc.
|
has an array size, etc.
|
||||||
*/
|
*/
|
||||||
const Type *baseType;
|
const Type *baseType;
|
||||||
|
|
||||||
const Type *GetBaseType(SourcePos pos) const;
|
|
||||||
|
|
||||||
/** If this is a declaration with a vector type, this gives the vector
|
/** If this is a declaration with a vector type, this gives the vector
|
||||||
width. For non-vector types, this is zero.
|
width. For non-vector types, this is zero.
|
||||||
*/
|
*/
|
||||||
@@ -120,14 +113,6 @@ public:
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
enum DeclaratorKind {
|
|
||||||
DK_BASE,
|
|
||||||
DK_POINTER,
|
|
||||||
DK_REFERENCE,
|
|
||||||
DK_ARRAY,
|
|
||||||
DK_FUNCTION
|
|
||||||
};
|
|
||||||
|
|
||||||
/** @brief Representation of the declaration of a single variable.
|
/** @brief Representation of the declaration of a single variable.
|
||||||
|
|
||||||
In conjunction with an instance of the DeclSpecs, this gives us
|
In conjunction with an instance of the DeclSpecs, this gives us
|
||||||
@@ -135,7 +120,13 @@ enum DeclaratorKind {
|
|||||||
*/
|
*/
|
||||||
class Declarator {
|
class Declarator {
|
||||||
public:
|
public:
|
||||||
Declarator(DeclaratorKind dk, SourcePos p);
|
Declarator(Symbol *s, SourcePos p);
|
||||||
|
|
||||||
|
/** As the parser peels off array dimension declarations after the
|
||||||
|
symbol name, it calls this method to provide them to the
|
||||||
|
Declarator.
|
||||||
|
*/
|
||||||
|
void AddArrayDimension(int size);
|
||||||
|
|
||||||
/** Once a DeclSpecs instance is available, this method completes the
|
/** Once a DeclSpecs instance is available, this method completes the
|
||||||
initialization of the Symbol, setting its Type accordingly.
|
initialization of the Symbol, setting its Type accordingly.
|
||||||
@@ -143,51 +134,21 @@ public:
|
|||||||
void InitFromDeclSpecs(DeclSpecs *ds);
|
void InitFromDeclSpecs(DeclSpecs *ds);
|
||||||
|
|
||||||
/** Get the actual type of the combination of Declarator and the given
|
/** Get the actual type of the combination of Declarator and the given
|
||||||
DeclSpecs. If an explicit base type is provided, the declarator is
|
DeclSpecs */
|
||||||
applied to that type; otherwise the base type from the DeclSpecs is
|
|
||||||
used. */
|
|
||||||
const Type *GetType(DeclSpecs *ds) const;
|
const Type *GetType(DeclSpecs *ds) const;
|
||||||
const Type *GetType(const Type *base, DeclSpecs *ds) const;
|
|
||||||
|
|
||||||
/** Returns the symbol corresponding to the function declared by this
|
|
||||||
declarator and symbols for its arguments in *args. */
|
|
||||||
Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
|
|
||||||
|
|
||||||
/** Returns the symbol associated with the declarator. */
|
|
||||||
Symbol *GetSymbol() const;
|
|
||||||
|
|
||||||
void Print() const;
|
void Print() const;
|
||||||
|
|
||||||
/** Position of the declarator in the source program. */
|
|
||||||
const SourcePos pos;
|
const SourcePos pos;
|
||||||
|
|
||||||
/** The kind of this declarator; complex declarations are assembled as
|
|
||||||
a hierarchy of Declarators. (For example, a pointer to an int
|
|
||||||
would have a root declarator with kind DK_POINTER and with the
|
|
||||||
Declarator::child member pointing to a DK_BASE declarator for the
|
|
||||||
int). */
|
|
||||||
const DeclaratorKind kind;
|
|
||||||
|
|
||||||
/** Child pointer if needed; this can only be non-NULL if the
|
|
||||||
declarator's kind isn't DK_BASE. */
|
|
||||||
Declarator *child;
|
|
||||||
|
|
||||||
/** Type qualifiers provided with the declarator. */
|
|
||||||
int typeQualifiers;
|
|
||||||
|
|
||||||
/** For array declarators, this gives the declared size of the array.
|
|
||||||
Unsized arrays have arraySize == 0. */
|
|
||||||
int arraySize;
|
|
||||||
|
|
||||||
/** Symbol associated with the declarator. */
|
|
||||||
Symbol *sym;
|
Symbol *sym;
|
||||||
|
/** If this declarator includes an array specification, the sizes of
|
||||||
|
the array dimensions are represented here.
|
||||||
|
*/
|
||||||
|
std::vector<int> arraySize;
|
||||||
/** Initialization expression for the variable. May be NULL. */
|
/** Initialization expression for the variable. May be NULL. */
|
||||||
Expr *initExpr;
|
Expr *initExpr;
|
||||||
|
bool isFunction;
|
||||||
/** For function declarations, this holds the Declaration *s for the
|
std::vector<Declaration *> *functionArgs;
|
||||||
function's parameters. */
|
|
||||||
std::vector<Declaration *> functionParams;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@@ -196,18 +157,27 @@ public:
|
|||||||
*/
|
*/
|
||||||
class Declaration {
|
class Declaration {
|
||||||
public:
|
public:
|
||||||
Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL);
|
Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL) {
|
||||||
Declaration(DeclSpecs *ds, Declarator *d);
|
declSpecs = ds;
|
||||||
|
if (dlist != NULL)
|
||||||
|
declarators = *dlist;
|
||||||
|
for (unsigned int i = 0; i < declarators.size(); ++i)
|
||||||
|
if (declarators[i] != NULL)
|
||||||
|
declarators[i]->InitFromDeclSpecs(declSpecs);
|
||||||
|
}
|
||||||
|
Declaration(DeclSpecs *ds, Declarator *d) {
|
||||||
|
declSpecs = ds;
|
||||||
|
if (d) {
|
||||||
|
d->InitFromDeclSpecs(ds);
|
||||||
|
declarators.push_back(d);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Adds the symbols for the variables in the declaration to the symbol
|
||||||
|
table. */
|
||||||
|
void AddSymbols(SymbolTable *st) const;
|
||||||
void Print() const;
|
void Print() const;
|
||||||
|
|
||||||
/** This method walks through all of the Declarators in a declaration
|
|
||||||
and returns a fully-initialized Symbol and (possibly) an
|
|
||||||
initialization expression for each one. (This allows the rest of
|
|
||||||
the system to not have to worry about the mess of the general
|
|
||||||
Declarator representation.) */
|
|
||||||
std::vector<VariableDeclaration> GetVariableDeclarations() const;
|
|
||||||
|
|
||||||
DeclSpecs *declSpecs;
|
DeclSpecs *declSpecs;
|
||||||
std::vector<Declarator *> declarators;
|
std::vector<Declarator *> declarators;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,85 +1,3 @@
|
|||||||
=== v1.1.0 === (5 December 2011)
|
|
||||||
|
|
||||||
This is a major new release of the compiler, with significant additions to
|
|
||||||
language functionality and capabilities. It includes a number of small
|
|
||||||
language syntax changes that will require modification of existing
|
|
||||||
programs. These changes should generally be straightforward and all are
|
|
||||||
steps toward eliminating parts of ispc syntax that are incompatible with
|
|
||||||
C/C++. See
|
|
||||||
http://ispc.github.com/ispc.html#updating-ispc-programs-for-changes-in-ispc-1-1
|
|
||||||
for more information about these changes.
|
|
||||||
|
|
||||||
ispc now fully supports pointers, including pointer arithmetic, implicit
|
|
||||||
conversions of arrays to pointers, and all of the other capabilities of
|
|
||||||
pointers in C. See http://ispc.github.com/ispc.html#pointer-types for more
|
|
||||||
information about pointers in ispc and
|
|
||||||
http://ispc.github.com/ispc.html#function-pointer-types for information
|
|
||||||
about function pointers in ispc.
|
|
||||||
|
|
||||||
Reference types are now declared with C++ syntax (e.g. "const float &foo").
|
|
||||||
|
|
||||||
ispc now supports 64-bit addressing. For performance reasons, this
|
|
||||||
capability is disabled by default (even on 64-bit targets), but can be
|
|
||||||
enabled with a command-line flag:
|
|
||||||
http://ispc.github.com/ispc.html#selecting-32-or-64-bit-addressing.
|
|
||||||
|
|
||||||
This release features new parallel "foreach" statements, which make it
|
|
||||||
easier in many instances to map program instances to data for data-parallel
|
|
||||||
computation than the programIndex/programCount mechanism:
|
|
||||||
http://ispc.github.com/ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled.
|
|
||||||
|
|
||||||
Finally, all of the system's documentation has been significantly revised.
|
|
||||||
The documentation of ispc's parallel execution model has been rewritten:
|
|
||||||
http://ispc.github.com/ispc.html#the-ispc-parallel-execution-model, and
|
|
||||||
there is now a more specific discussion of similarities and differences
|
|
||||||
between ispc and C/C++:
|
|
||||||
http://ispc.github.com/ispc.html#relationship-to-the-c-programming-language.
|
|
||||||
There is now a separate FAQ (http://ispc.github.com/faq.html), and a
|
|
||||||
Performance Guide (http://ispc.github.com/perfguide.html).
|
|
||||||
|
|
||||||
=== v1.0.12 === (20 October 2011)
|
|
||||||
|
|
||||||
This release includes a new "double-pumped" 8-wide target for SSE2,
|
|
||||||
"sse2-x2". Like the sse4-x2 and avx-x2 targets, this target may deliver
|
|
||||||
higher performance for some workloads than the regular sse2 target. (For
|
|
||||||
other workloads, it may be slower.)
|
|
||||||
|
|
||||||
The ispc language now includes an "assert()" statement. See
|
|
||||||
http://ispc.github.com/ispc.html#assertions for more information.
|
|
||||||
|
|
||||||
The compiler now sets a preprocessor #define based on the target ISA; for
|
|
||||||
example, ISPC_TARGET_SSE4 is defined for the sse4 targets, and so forth.
|
|
||||||
|
|
||||||
The standard library now provides high-performance routines for converting
|
|
||||||
between some "array of structures" and "structure of arrays" formats.
|
|
||||||
See
|
|
||||||
http://ispc.github.com/ispc.html#converting-between-array-of-structures-and-structure-of-arrays-layout
|
|
||||||
for more information.
|
|
||||||
|
|
||||||
Inline functions now have static linkage.
|
|
||||||
|
|
||||||
A number of improvements have been made to the optimization passes that
|
|
||||||
detect when scatters and gathers can be transformed into vector stores and
|
|
||||||
loads, respectively. In particular, these passes now handle variables that
|
|
||||||
are used as loop induction variables much better.
|
|
||||||
|
|
||||||
=== v1.0.11 === (6 October 2011)
|
|
||||||
|
|
||||||
The main new feature in this release is support for generating code for
|
|
||||||
multiple targets (e.g., SSE2, SSE4, and AVX) and having the compiled code
|
|
||||||
select the best variant at execution time. For more information, see
|
|
||||||
http://ispc.github.com/ispc.html#compiling-with-support-for-multiple-instruction-sets.
|
|
||||||
|
|
||||||
All of the examples now take advantage of the support for multiple
|
|
||||||
compilation targets; thus, if one has an AVX system, it's not necessary to
|
|
||||||
recompile the examples to use the AVX target.
|
|
||||||
|
|
||||||
Performance of the built-in task system that is used in the examples has
|
|
||||||
been improved.
|
|
||||||
|
|
||||||
Finally, the print() statement now works on OSX; it had been broken for the
|
|
||||||
last few releases.
|
|
||||||
|
|
||||||
=== v1.0.10 === (30 September 2011)
|
=== v1.0.10 === (30 September 2011)
|
||||||
|
|
||||||
This release features an extensive new example showing the application of
|
This release features an extensive new example showing the application of
|
||||||
|
|||||||
@@ -1,12 +1,6 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
for i in ispc perfguide faq; do
|
rst2html.py ispc.txt > ispc.html
|
||||||
rst2html.py --template=template.txt --link-stylesheet \
|
|
||||||
--stylesheet-path=css/style.css $i.txt > $i.html
|
|
||||||
done
|
|
||||||
|
|
||||||
rst2html.py --template=template-perf.txt --link-stylesheet \
|
|
||||||
--stylesheet-path=css/style.css perf.txt > perf.html
|
|
||||||
|
|
||||||
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
|
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
|
||||||
#pdflatex ispc.tex
|
#pdflatex ispc.tex
|
||||||
|
|||||||
482
docs/faq.txt
482
docs/faq.txt
@@ -1,482 +0,0 @@
|
|||||||
=============================================================
|
|
||||||
Intel® SPMD Program Compiler Frequently Asked Questions (FAQ)
|
|
||||||
=============================================================
|
|
||||||
|
|
||||||
This document includes a number of frequently (and not frequently) asked
|
|
||||||
questions about ispc, the Intel® SPMD Program Compiler. The source to this
|
|
||||||
document is in the file ``docs/faq.txt`` in the ``ispc`` source
|
|
||||||
distribution.
|
|
||||||
|
|
||||||
* Understanding ispc's Output
|
|
||||||
|
|
||||||
+ `How can I see the assembly language generated by ispc?`_
|
|
||||||
+ `How can I have the assembly output be printed using Intel assembly syntax?`_
|
|
||||||
+ `Why are there multiple versions of exported ispc functions in the assembly output?`_
|
|
||||||
+ `How can I more easily see gathers and scatters in generated assembly?`_
|
|
||||||
|
|
||||||
* Interoperability
|
|
||||||
|
|
||||||
+ `How can I supply an initial execution mask in the call from the application?`_
|
|
||||||
+ `How can I generate a single binary executable with support for multiple instruction sets?`_
|
|
||||||
+ `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
|
|
||||||
|
|
||||||
* Programming Techniques
|
|
||||||
|
|
||||||
+ `What primitives are there for communicating between SPMD program instances?`_
|
|
||||||
+ `How can a gang of program instances generate variable amounts of output efficiently?`_
|
|
||||||
+ `Is it possible to use ispc for explicit vector programming?`_
|
|
||||||
+ `How can I debug my ispc programs using Valgrind?`_
|
|
||||||
|
|
||||||
Understanding ispc's Output
|
|
||||||
===========================
|
|
||||||
|
|
||||||
How can I see the assembly language generated by ispc?
|
|
||||||
------------------------------------------------------
|
|
||||||
|
|
||||||
The ``--emit-asm`` flag causes assembly output to be generated. If the
|
|
||||||
``-o`` command-line flag is also supplied, the assembly is stored in the
|
|
||||||
given file, or printed to standard output if ``-`` is specified for the
|
|
||||||
filename. For example, given the simple ``ispc`` program:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
export uniform int foo(uniform int a, uniform int b) {
|
|
||||||
return a+b;
|
|
||||||
}
|
|
||||||
|
|
||||||
If the SSE4 target is used, then the following assembly is printed:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
_foo:
|
|
||||||
addl %esi, %edi
|
|
||||||
movl %edi, %eax
|
|
||||||
ret
|
|
||||||
|
|
||||||
|
|
||||||
How can I have the assembly output be printed using Intel assembly syntax?
|
|
||||||
--------------------------------------------------------------------------
|
|
||||||
|
|
||||||
The ``ispc`` compiler is currently only able to emit assembly with AT&T
|
|
||||||
syntax, where the destination operand is the last operand after an
|
|
||||||
instruction. If you'd prefer Intel assembly output, one option is to use
|
|
||||||
Agner Fog's ``objconv`` tool: have ``ispc`` emit a native object file and
|
|
||||||
then use ``objconv`` to disassemble it, specifying the assembler syntax
|
|
||||||
that you prefer. ``objconv`` `is available for download here`_.
|
|
||||||
|
|
||||||
.. _is available for download here: http://www.agner.org/optimize/#objconv
|
|
||||||
|
|
||||||
Why are there multiple versions of exported ispc functions in the assembly output?
|
|
||||||
----------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
Two versions of all functions qualified with ``export`` are generated:
|
|
||||||
one of them is to be called by other ``ispc`` functions, and the
|
|
||||||
other is to be called by the application. The application callable
|
|
||||||
function has the original function's name, while the ``ispc``-callable
|
|
||||||
function has a mangled name that encodes the types of the function's
|
|
||||||
parameters.
|
|
||||||
|
|
||||||
The crucial difference between these two functions is that the
|
|
||||||
application-callable function doesn't take a parameter encoding the current
|
|
||||||
execution mask, while ``ispc``-callable functions have a hidden mask
|
|
||||||
parameter. An implication of this difference is that the ``export``
|
|
||||||
function starts with the execution mask "all on". This allows a number of
|
|
||||||
improvements in the generated code, particularly on architectures that
|
|
||||||
don't have support for masked load and store instructions.
|
|
||||||
|
|
||||||
As an example, consider this short function, which loads a vector's worth
|
|
||||||
of values from two arrays in memory, adds them, and writes the result to an
|
|
||||||
output array.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
export void foo(uniform float a[], uniform float b[],
|
|
||||||
uniform float result[]) {
|
|
||||||
float aa = a[programIndex], bb = b[programIndex];
|
|
||||||
result[programIndex] = aa+bb;
|
|
||||||
}
|
|
||||||
|
|
||||||
Here is the assembly code for the application-callable instance of the
|
|
||||||
function.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
_foo:
|
|
||||||
movups (%rsi), %xmm1
|
|
||||||
movups (%rdi), %xmm0
|
|
||||||
addps %xmm1, %xmm0
|
|
||||||
movups %xmm0, (%rdx)
|
|
||||||
ret
|
|
||||||
|
|
||||||
|
|
||||||
And here is the assembly code for the ``ispc``-callable instance of the
|
|
||||||
function.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
"_foo___uptr<Uf>uptr<Uf>uptr<Uf>":
|
|
||||||
movmskps %xmm0, %eax
|
|
||||||
cmpl $15, %eax
|
|
||||||
je LBB0_3
|
|
||||||
testl %eax, %eax
|
|
||||||
jne LBB0_4
|
|
||||||
ret
|
|
||||||
LBB0_3:
|
|
||||||
movups (%rsi), %xmm1
|
|
||||||
movups (%rdi), %xmm0
|
|
||||||
addps %xmm1, %xmm0
|
|
||||||
movups %xmm0, (%rdx)
|
|
||||||
ret
|
|
||||||
LBB0_4:
|
|
||||||
####
|
|
||||||
#### Code elided; handle mixed mask case..
|
|
||||||
####
|
|
||||||
ret
|
|
||||||
|
|
||||||
There are a few things to notice in this code. First, the current program
|
|
||||||
mask is coming in via the ``%xmm0`` register and the initial few
|
|
||||||
instructions in the function essentially check to see if the mask is all on
|
|
||||||
or all off. If the mask is all on, the code at the label LBB0_3 executes;
|
|
||||||
it's the same as the code that was generated for ``_foo`` above. If the
|
|
||||||
mask is all off, then there's nothing to be done, and the function can
|
|
||||||
return immediately.
|
|
||||||
|
|
||||||
In the case of a mixed mask, a substantial amount of code is generated to
|
|
||||||
load from and then store to only the array elements that correspond to
|
|
||||||
program instances where the mask is on. (This code is elided in the listing above.) This
|
|
||||||
general pattern of having two code paths for the "all on" and "mixed" mask
|
|
||||||
cases is used in the code generated for almost all but the most simple
|
|
||||||
functions (where the overhead of the test isn't worthwhile.)
|
|
||||||
|
|
||||||
How can I more easily see gathers and scatters in generated assembly?
|
|
||||||
---------------------------------------------------------------------
|
|
||||||
|
|
||||||
Because CPU vector ISAs don't have native gather and scatter instructions,
|
|
||||||
these memory operations are turned into sequences of
|
|
||||||
instructions in the code that ``ispc`` generates. In some cases, it can be
|
|
||||||
useful to see where gathers and scatters actually happen in code; there is
|
|
||||||
an otherwise undocumented command-line flag that provides this information.
|
|
||||||
|
|
||||||
Consider this simple program:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
void set(uniform int a[], int value, int index) {
|
|
||||||
a[index] = value;
|
|
||||||
}
|
|
||||||
|
|
||||||
When compiled normally to the SSE4 target, this program generates this
|
|
||||||
extensive code sequence, which makes it more difficult to see what the
|
|
||||||
program is actually doing.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
"_set___uptr<Ui>ii":
|
|
||||||
pmulld LCPI0_0(%rip), %xmm1
|
|
||||||
movmskps %xmm2, %eax
|
|
||||||
testb $1, %al
|
|
||||||
je LBB0_2
|
|
||||||
movd %xmm1, %ecx
|
|
||||||
movd %xmm0, (%rcx,%rdi)
|
|
||||||
LBB0_2:
|
|
||||||
testb $2, %al
|
|
||||||
je LBB0_4
|
|
||||||
pextrd $1, %xmm1, %ecx
|
|
||||||
pextrd $1, %xmm0, (%rcx,%rdi)
|
|
||||||
LBB0_4:
|
|
||||||
testb $4, %al
|
|
||||||
je LBB0_6
|
|
||||||
pextrd $2, %xmm1, %ecx
|
|
||||||
pextrd $2, %xmm0, (%rcx,%rdi)
|
|
||||||
LBB0_6:
|
|
||||||
testb $8, %al
|
|
||||||
je LBB0_8
|
|
||||||
pextrd $3, %xmm1, %eax
|
|
||||||
pextrd $3, %xmm0, (%rax,%rdi)
|
|
||||||
LBB0_8:
|
|
||||||
ret
|
|
||||||
|
|
||||||
If this program is compiled with the
|
|
||||||
``--opt=disable-handle-pseudo-memory-ops`` command-line flag, then the
|
|
||||||
scatter is left as an unresolved function call. The resulting program
|
|
||||||
won't link because of the unresolved symbol, but the assembly output is much
|
|
||||||
easier to understand:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
"_set___uptr<Ui>ii":
|
|
||||||
movaps %xmm0, %xmm3
|
|
||||||
pmulld LCPI0_0(%rip), %xmm1
|
|
||||||
movdqa %xmm1, %xmm0
|
|
||||||
movaps %xmm3, %xmm1
|
|
||||||
jmp ___pseudo_scatter_base_offsets32_32 ## TAILCALL
|
|
||||||
|
|
||||||
|
|
||||||
Interoperability
|
|
||||||
================
|
|
||||||
|
|
||||||
How can I supply an initial execution mask in the call from the application?
|
|
||||||
----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
Recall that when execution transitions from the application code to an
|
|
||||||
``ispc`` function, all of the program instances are initially executing.
|
|
||||||
In some cases, it may be desired that only some of them are running, based on
|
|
||||||
a data-dependent condition computed in the application program. This
|
|
||||||
situation can easily be handled via an additional parameter from the
|
|
||||||
application.
|
|
||||||
|
|
||||||
As a simple example, consider a case where the application code has an
|
|
||||||
array of ``float`` values and we'd like the ``ispc`` code to update
|
|
||||||
just specific values in that array, where which of those values are to be
|
|
||||||
updated has been determined by the application. In C++ code, we might
|
|
||||||
have:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
int count = ...;
|
|
||||||
float *array = new float[count];
|
|
||||||
bool *shouldUpdate = new bool[count];
|
|
||||||
// initialize array and shouldUpdate
|
|
||||||
ispc_func(array, shouldUpdate, count);
|
|
||||||
|
|
||||||
Then, the ``ispc`` code could process this update as:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
export void ispc_func(uniform float array[], uniform bool update[],
|
|
||||||
uniform int count) {
|
|
||||||
foreach (i = 0 ... count) {
|
|
||||||
cif (update[i] == true)
|
|
||||||
// update array[i]...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
(In this case a "coherent" if statement is likely to be worthwhile if the
|
|
||||||
``update`` array will tend to have sections that are either all-true or
|
|
||||||
all-false.)
|
|
||||||
|
|
||||||
How can I generate a single binary executable with support for multiple instruction sets?
|
|
||||||
-----------------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
``ispc`` can also generate output that supports multiple target instruction
|
|
||||||
sets, also generating code that chooses the most appropriate one at runtime
|
|
||||||
if multiple targets are specified with the ``--target`` command-line
|
|
||||||
argument.
|
|
||||||
|
|
||||||
For example, if you run the command:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
ispc foo.ispc -o foo.o --target=sse2,sse4-x2,avx-x2
|
|
||||||
|
|
||||||
Then four object files will be generated: ``foo_sse2.o``, ``foo_sse4.o``,
|
|
||||||
``foo_avx.o``, and ``foo.o``.[#]_ Link all of these into your executable, and
|
|
||||||
when you call a function in ``foo.ispc`` from your application code,
|
|
||||||
``ispc`` will determine which instruction sets are supported by the CPU the
|
|
||||||
code is running on and will call the most appropriate version of the
|
|
||||||
function available.
|
|
||||||
|
|
||||||
.. [#] Similarly, if you choose to generate assembly language output or
|
|
||||||
LLVM bitcode output, multiple versions of those files will be created.
|
|
||||||
|
|
||||||
In general, the version of the function that runs will be the one in the
|
|
||||||
most general instruction set that is supported by the system. If you only
|
|
||||||
compile SSE2 and SSE4 variants and run on a system that supports AVX, for
|
|
||||||
example, then the SSE4 variant will be executed. If the system
|
|
||||||
is not able to run any of the available variants of the function (for
|
|
||||||
example, trying to run a function that only has SSE4 and AVX variants on a
|
|
||||||
system that only supports SSE2), then the standard library ``abort()``
|
|
||||||
function will be called.
|
|
||||||
|
|
||||||
One subtlety is that all non-static global variables (if any) must have the
|
|
||||||
same size and layout with all of the targets used. For example, if you
|
|
||||||
have the global variables:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
uniform int foo[2*programCount];
|
|
||||||
int bar;
|
|
||||||
|
|
||||||
and compile to both SSE2 and AVX targets, both of these variables will have
|
|
||||||
different sizes (the first due to program count having the value 4 for SSE2
|
|
||||||
and 8 for AVX, and the second due to ``varying`` types having different
|
|
||||||
numbers of elements with the two targets--essentially the same issue as the
|
|
||||||
first.) ``ispc`` issues an error in this case.
|
|
||||||
|
|
||||||
|
|
||||||
How can I determine at run-time which vector instruction set's instructions were selected to execute?
|
|
||||||
-----------------------------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
``ispc`` doesn't provide any API that allows querying which vector ISA's
|
|
||||||
instructions are running when multi-target compilation was used. However,
|
|
||||||
this can be solved in "user space" by writing a small helper function.
|
|
||||||
Specifically, if you implement a function like this
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
export uniform int isa() {
|
|
||||||
#if defined(ISPC_TARGET_SSE2)
|
|
||||||
return 0;
|
|
||||||
#elif defined(ISPC_TARGET_SSE4)
|
|
||||||
return 1;
|
|
||||||
#elif defined(ISPC_TARGET_AVX)
|
|
||||||
return 2;
|
|
||||||
#else
|
|
||||||
return -1;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
And then call it from your application code at runtime, it will return 0,
|
|
||||||
1, or 2, depending on which target's instructions are running.
|
|
||||||
|
|
||||||
The way this works is a little surprising, but it's a useful trick. Of
|
|
||||||
course the preprocessor ``#if`` checks are all compile-time only
|
|
||||||
operations. What's actually happening is that the function is compiled
|
|
||||||
multiple times, once for each target, with the appropriate ``ISPC_TARGET``
|
|
||||||
preprocessor symbol set. Then, a small dispatch function is generated for
|
|
||||||
the application to actually call. This dispatch function in turn calls the
|
|
||||||
appropriate version of the function based on the CPU of the system it's
|
|
||||||
executing on, which in turn returns the appropriate value.
|
|
||||||
|
|
||||||
In a similar fashion, it's possible to find out at run-time the value of
|
|
||||||
``programCount`` for the target that's actually being used.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
export uniform int width() { return programCount; }
|
|
||||||
|
|
||||||
|
|
||||||
Programming Techniques
|
|
||||||
======================
|
|
||||||
|
|
||||||
What primitives are there for communicating between SPMD program instances?
|
|
||||||
---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
The ``broadcast()``, ``rotate()``, and ``shuffle()`` standard library
|
|
||||||
routines provide a variety of mechanisms for the running program instances
|
|
||||||
to communicate values to each other during execution. Note that there's no
|
|
||||||
need to synchronize the program instances before communicating between
|
|
||||||
them, due to the synchronized execution model of gangs of program instances
|
|
||||||
in ``ispc``.
|
|
||||||
|
|
||||||
How can a gang of program instances generate variable amounts of output efficiently?
|
|
||||||
------------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
It's not unusual to have a gang of program instances where each program
|
|
||||||
instance generates a variable amount of output (perhaps some generate no
|
|
||||||
output, some generate one output value, some generate many output values
|
|
||||||
and so forth), and where one would like to have the output densely packed
|
|
||||||
in an output array. The ``exclusive_scan_add()`` function from the
|
|
||||||
standard library is quite useful in this situation.
|
|
||||||
|
|
||||||
Consider the following function:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
uniform int func(uniform float outArray[], ...) {
|
|
||||||
int numOut = ...; // figure out how many to be output
|
|
||||||
float outLocal[MAX_OUT]; // staging area
|
|
||||||
|
|
||||||
// each program instance in the gang puts its results in
|
|
||||||
// outLocal[0], ..., outLocal[numOut-1]
|
|
||||||
|
|
||||||
int startOffset = exclusive_scan_add(numOut);
|
|
||||||
for (int i = 0; i < numOut; ++i)
|
|
||||||
outArray[startOffset + i] = outLocal[i];
|
|
||||||
return reduce_add(numOut);
|
|
||||||
}
|
|
||||||
|
|
||||||
Here, each program instance has computed a number, ``numOut``, of values to
|
|
||||||
output, and has stored them in the ``outLocal`` array. Assume that four
|
|
||||||
program instances are running and that the first one wants to output one
|
|
||||||
value, the second two values, and the third and fourth three values each.
|
|
||||||
In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6)
|
|
||||||
to the four program instances, respectively.
|
|
||||||
|
|
||||||
The first program instance will then write its one result to
|
|
||||||
``outArray[0]``, the second will write its two values to ``outArray[1]``
|
|
||||||
and ``outArray[2]``, and so forth. The ``reduce_add()`` call at the end
|
|
||||||
returns the total number of values that all of the program instances have
|
|
||||||
written to the array.
|
|
||||||
|
|
||||||
FIXME: add discussion of foreach_active as an option here once that's in
|
|
||||||
|
|
||||||
Is it possible to use ispc for explicit vector programming?
|
|
||||||
-----------------------------------------------------------
|
|
||||||
|
|
||||||
The typical model for programming in ``ispc`` is an *implicit* parallel
|
|
||||||
model, where one writes a program that is apparently doing scalar
|
|
||||||
computation on values and the program is then vectorized to run in parallel
|
|
||||||
across the SIMD lanes of a processor. However, ``ispc`` also has some
|
|
||||||
support for explicit vector unit programming, where the vectorization is
|
|
||||||
explicit. Some computations may be more effectively described in the
|
|
||||||
explicit model rather than the implicit model.
|
|
||||||
|
|
||||||
This support is provided via ``uniform`` instances of short vector types.
|
|
||||||
Specifically, if this short program
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
export uniform float<8> madd(uniform float<8> a, uniform float<8> b,
|
|
||||||
uniform float<8> c) {
|
|
||||||
return a + b * c;
|
|
||||||
}
|
|
||||||
|
|
||||||
is compiled with the AVX target, ``ispc`` generates the following assembly:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
_madd:
|
|
||||||
vmulps %ymm2, %ymm1, %ymm1
|
|
||||||
vaddps %ymm0, %ymm1, %ymm0
|
|
||||||
ret
|
|
||||||
|
|
||||||
(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
|
|
||||||
``addps`` instructions are generated, and so forth.)
|
|
||||||
|
|
||||||
Note that ``ispc`` doesn't currently support control-flow based on
|
|
||||||
``uniform`` short vector types; it is thus not possible to write code like:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
|
|
||||||
uniform int<8> sum = 0;
|
|
||||||
while (a++ < b)
|
|
||||||
++sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
How can I debug my ispc programs using Valgrind?
|
|
||||||
------------------------------------------------
|
|
||||||
|
|
||||||
The `valgrind`_ memory checker is an extremely useful tool for
|
|
||||||
Linux and OSX; it detects a range of memory errors, including accessing
|
|
||||||
memory after it has been freed, accessing memory beyond the end of an
|
|
||||||
array, accessing uninitialized stack variables, and so forth.
|
|
||||||
In general, applications that use ``ispc`` code run with ``valgrind``
|
|
||||||
without modification and ``valgrind`` will detect the same range of memory
|
|
||||||
errors in ``ispc`` code that it does in C/C++ code.
|
|
||||||
|
|
||||||
.. _valgrind: http://valgrind.org
|
|
||||||
|
|
||||||
One issue to be aware of is that until recently, ``valgrind`` only
|
|
||||||
supported the SSE2 vector instructions; if you are using a version of
|
|
||||||
``valgrind`` older than the 3.7.0 release (5 November 2011), you should
|
|
||||||
compile your ``ispc`` programs with ``--target=sse2`` before running them
|
|
||||||
through ``valgrind``. (Note that if no target is specified, then ``ispc``
|
|
||||||
chooses a target based on the capabilities of the system you're running
|
|
||||||
``ispc`` on.) If you run an ``ispc`` program that uses instructions that
|
|
||||||
``valgrind`` doesn't support, you'll see an error message like:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
vex amd64->IR: unhandled instruction bytes: 0xC5 0xFA 0x10 0x0 0xC5 0xFA 0x11 0x84
|
|
||||||
==46059== valgrind: Unrecognised instruction at address 0x100002707.
|
|
||||||
|
|
||||||
The just-released valgrind 3.7.0 adds support for the SSE4.2 instruction
|
|
||||||
set; if you're using that version (and your system supports SSE4.2), then
|
|
||||||
you can use ``--target=sse4`` when compiling to run with ``valgrind``.
|
|
||||||
|
|
||||||
Note that ``valgrind`` does not yet support programs that use the AVX
|
|
||||||
instruction set.
|
|
||||||
|
|
||||||
3217
docs/ispc.txt
3217
docs/ispc.txt
File diff suppressed because it is too large
Load Diff
@@ -1,85 +0,0 @@
|
|||||||
===========
|
|
||||||
Performance
|
|
||||||
===========
|
|
||||||
|
|
||||||
The SPMD programming model that ``ispc`` provides makes it easy to harness the
|
|
||||||
computational power available in SIMD vector units on modern CPUs, while
|
|
||||||
its basis in C makes it easy for programmers to adopt and use
|
|
||||||
productively. This page summarizes the performance of ``ispc`` with the
|
|
||||||
workloads in the ``examples/`` directory of the ``ispc`` distribution.
|
|
||||||
|
|
||||||
These results were measured on an Apple iMac with a 4-core 3.4GHz
|
|
||||||
Intel® Core-i7 processor using the Intel® AVX instruction set. The basis
|
|
||||||
for comparison is a reference C++ implementation compiled with gcc 4.2.1,
|
|
||||||
the version distributed with OS X 10.7.2. (The reference implementation is
|
|
||||||
also included in the ``examples/`` directory.)
|
|
||||||
|
|
||||||
.. list-table:: Performance of ``ispc`` with a variety of the workloads
|
|
||||||
from the ``examples/`` directory of the ``ispc`` distribution, compared to
|
|
||||||
a reference C++ implementation compiled with gcc 4.2.1.
|
|
||||||
|
|
||||||
* - Workload
|
|
||||||
- ``ispc``, 1 core
|
|
||||||
- ``ispc``, 4 cores
|
|
||||||
* - `AOBench`_ (512 x 512 resolution)
|
|
||||||
- 3.99x
|
|
||||||
- 19.32x
|
|
||||||
* - `Binomial Options`_ (128k options)
|
|
||||||
- 7.94x
|
|
||||||
- 33.43x
|
|
||||||
* - `Black-Scholes Options`_ (128k options)
|
|
||||||
- 8.45x
|
|
||||||
- 32.48x
|
|
||||||
* - `Deferred Shading`_ (1280p)
|
|
||||||
- n/a
|
|
||||||
- 23.06x
|
|
||||||
* - `Mandelbrot Set`_
|
|
||||||
- 6.21x
|
|
||||||
- 19.90x
|
|
||||||
* - `Perlin Noise Function`_
|
|
||||||
- 5.37x
|
|
||||||
- n/a
|
|
||||||
* - `Ray Tracer`_ (Sponza dataset)
|
|
||||||
- 3.99x
|
|
||||||
- 19.32x
|
|
||||||
* - `3D Stencil`_
|
|
||||||
- 3.76x
|
|
||||||
- 13.79x
|
|
||||||
* - `Volume Rendering`_
|
|
||||||
- 3.11x
|
|
||||||
- 15.80x
|
|
||||||
|
|
||||||
|
|
||||||
.. _AOBench: https://github.com/ispc/ispc/tree/master/examples/aobench
|
|
||||||
.. _Binomial Options: https://github.com/ispc/ispc/tree/master/examples/options
|
|
||||||
.. _Black-Scholes Options: https://github.com/ispc/ispc/tree/master/examples/options
|
|
||||||
.. _Deferred Shading: https://github.com/ispc/ispc/tree/master/examples/deferred
|
|
||||||
.. _Mandelbrot Set: https://github.com/ispc/ispc/tree/master/examples/mandelbrot_tasks
|
|
||||||
.. _Ray Tracer: https://github.com/ispc/ispc/tree/master/examples/rt
|
|
||||||
.. _Perlin Noise Function: https://github.com/ispc/ispc/tree/master/examples/noise
|
|
||||||
.. _3D Stencil: https://github.com/ispc/ispc/tree/master/examples/stencil
|
|
||||||
.. _Volume Rendering: https://github.com/ispc/ispc/tree/master/examples/volume_rendering
|
|
||||||
|
|
||||||
|
|
||||||
The following table shows speedups for a number of the examples on a
|
|
||||||
2.40GHz, 40-core Intel® Xeon E7-8870 system with the Intel® SSE4
|
|
||||||
instruction set, running Microsoft Windows Server 2008 Enterprise. Here,
|
|
||||||
the serial C/C++ baseline code was compiled with MSVC 2010.
|
|
||||||
|
|
||||||
.. list-table:: Performance of ``ispc`` with a variety of the workloads
|
|
||||||
from the ``examples/`` directory of the ``ispc`` distribution, on
|
|
||||||
a system with 40 CPU cores.
|
|
||||||
|
|
||||||
* - Workload
|
|
||||||
- ``ispc``, 40 cores
|
|
||||||
* - AOBench (2048 x 2048 resolution)
|
|
||||||
- 182.36x
|
|
||||||
* - Binomial Options (2m options)
|
|
||||||
- 63.85x
|
|
||||||
* - Black-Scholes Options (2m options)
|
|
||||||
- 83.97x
|
|
||||||
* - Ray Tracer (Sponza dataset)
|
|
||||||
- 195.67x
|
|
||||||
* - Volume Rendering
|
|
||||||
- 243.18x
|
|
||||||
|
|
||||||
@@ -1,714 +0,0 @@
|
|||||||
==============================================
|
|
||||||
Intel® SPMD Program Compiler Performance Guide
|
|
||||||
==============================================
|
|
||||||
|
|
||||||
The SPMD programming model provided by ``ispc`` naturally delivers
|
|
||||||
excellent performance for many workloads thanks to efficient use of CPU
|
|
||||||
SIMD vector hardware. This guide provides more details about how to get
|
|
||||||
the most out of ``ispc`` in practice.
|
|
||||||
|
|
||||||
* `Key Concepts`_
|
|
||||||
|
|
||||||
+ `Efficient Iteration With "foreach"`_
|
|
||||||
+ `Improving Control Flow Coherence With "foreach_tiled"`_
|
|
||||||
+ `Using Coherent Control Flow Constructs`_
|
|
||||||
+ `Use "uniform" Whenever Appropriate`_
|
|
||||||
|
|
||||||
* `Tips and Techniques`_
|
|
||||||
|
|
||||||
+ `Understanding Gather and Scatter`_
|
|
||||||
+ `Avoid 64-bit Addressing Calculations When Possible`_
|
|
||||||
+ `Avoid Computation With 8 and 16-bit Integer Types`_
|
|
||||||
+ `Implementing Reductions Efficiently`_
|
|
||||||
+ `Using Low-level Vector Tricks`_
|
|
||||||
+ `The "Fast math" Option`_
|
|
||||||
+ `"inline" Aggressively`_
|
|
||||||
+ `Avoid The System Math Library`_
|
|
||||||
+ `Declare Variables In The Scope Where They're Used`_
|
|
||||||
+ `Instrumenting ISPC Programs To Understand Runtime Behavior`_
|
|
||||||
+ `Choosing A Target Vector Width`_
|
|
||||||
|
|
||||||
* `Disclaimer and Legal Information`_
|
|
||||||
|
|
||||||
* `Optimization Notice`_
|
|
||||||
|
|
||||||
Key Concepts
|
|
||||||
============
|
|
||||||
|
|
||||||
This section describes the four most important concepts to understand and
|
|
||||||
keep in mind when writing high-performance ``ispc`` programs. It assumes
|
|
||||||
good familiarity with the topics covered in the ``ispc`` `Users Guide`_.
|
|
||||||
|
|
||||||
.. _Users Guide: ispc.html
|
|
||||||
|
|
||||||
Efficient Iteration With "foreach"
|
|
||||||
----------------------------------
|
|
||||||
|
|
||||||
The ``foreach`` parallel iteration construct is semantically equivalent to
|
|
||||||
a regular ``for()`` loop, though it offers meaningful performance benefits.
|
|
||||||
(See the `documentation on "foreach" in the Users Guide`_ for a review of
|
|
||||||
its syntax and semantics.) As an example, consider this simple function
|
|
||||||
that iterates over some number of elements in an array, doing computation
|
|
||||||
on each one:
|
|
||||||
|
|
||||||
.. _documentation on "foreach" in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
export void foo(uniform int a[], uniform int count) {
|
|
||||||
for (int i = programIndex; i < count; i += programCount) {
|
|
||||||
// do some computation on a[i]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Depending on the specifics of the computation being performed, the code
|
|
||||||
generated for this function could likely be improved by modifying the code
|
|
||||||
so that the loop only goes as far through the data as is possible to pack
|
|
||||||
an entire gang of program instances with computation each time through the
|
|
||||||
loop. Doing so enables the ``ispc`` compiler to generate more efficient
|
|
||||||
code for cases where it knows that the execution mask is "all on". Then,
|
|
||||||
an ``if`` statement at the end handles processing the ragged extra bits of
|
|
||||||
data that didn't fully fill a gang.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
export void foo(uniform int a[], uniform int count) {
|
|
||||||
// First, just loop up to the point where all program instances
|
|
||||||
// in the gang will be active at the loop iteration start
|
|
||||||
uniform int countBase = count & ~(programCount-1);
|
|
||||||
for (uniform int i = 0; i < countBase; i += programCount) {
|
|
||||||
int index = i + programIndex;
|
|
||||||
// do some computation on a[index]
|
|
||||||
}
|
|
||||||
// Now handle the ragged extra bits at the end
|
|
||||||
if (countBase < count) {
|
|
||||||
int index = countBase + programIndex;
|
|
||||||
// do some computation on a[index]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
While the performance of the above code will likely be better than the
|
|
||||||
first version of the function, the loop body code has been duplicated (or
|
|
||||||
has been forced to move into a separate utility function).
|
|
||||||
|
|
||||||
Using the ``foreach`` looping construct as below provides all of the
|
|
||||||
performance benefits of the second version of this function, with the
|
|
||||||
compactness of the first.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
export void foo(uniform int a[], uniform int count) {
|
|
||||||
foreach (i = 0 ... count) {
|
|
||||||
// do some computation on a[i]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Improving Control Flow Coherence With "foreach_tiled"
|
|
||||||
-----------------------------------------------------
|
|
||||||
|
|
||||||
Depending on the computation being performed, ``foreach_tiled`` may give
|
|
||||||
better performance than ``foreach``. (See the `documentation in the Users
|
|
||||||
Guide`_ for the syntax and semantics of ``foreach_tiled``.) Given a
|
|
||||||
multi-dimensional iteration like:
|
|
||||||
|
|
||||||
.. _documentation in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
foreach (i = 0 ... width, j = 0 ... height) {
|
|
||||||
// do computation on element (i,j)
|
|
||||||
}
|
|
||||||
|
|
||||||
if the ``foreach`` statement is used, elements in the gang of program
|
|
||||||
instances will be mapped to values of ``i`` and ``j`` by taking spans of
|
|
||||||
``programCount`` elements across ``i`` with a single value of ``j``. For
|
|
||||||
example, the ``foreach`` statement above roughly corresponds to:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
for (uniform int j = 0; j < height; ++j)
|
|
||||||
for (int i = 0; i < width; i += programCount) {
|
|
||||||
// do computation
|
|
||||||
}
|
|
||||||
|
|
||||||
When a multi-dimensional domain is being iterated over, the ``foreach_tiled``
|
|
||||||
statement maps program instances to data in a way that tries to select
|
|
||||||
square n-dimensional segments of the domain. For example, on a compilation
|
|
||||||
target with 8-wide gangs of program instances, it generates code that
|
|
||||||
iterates over the domain the same way as the following code (though more
|
|
||||||
efficiently):
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
for (int j = programIndex/4; j < height; j += 2)
|
|
||||||
for (int i = programIndex%4; i < width; i += 4) {
|
|
||||||
// do computation
|
|
||||||
}
|
|
||||||
|
|
||||||
Thus, each gang of program instances operates on a 2x4 tile of the domain.
|
|
||||||
With higher-dimensional iteration and different gang sizes, a similar
|
|
||||||
mapping is performed--e.g. for 2D iteration with a 16-wide gang size, 4x4
|
|
||||||
tiles are iterated over; for 4D iteration with an 8-wide gang, 1x2x2x2 tiles are
|
|
||||||
processed, and so forth.
|
|
||||||
|
|
||||||
Performance benefit can come from using ``foreach_tiled`` in that it
|
|
||||||
essentially optimizes for the benefit of iterating over *compact* regions
|
|
||||||
of the domain (while ``foreach`` iterates over the domain in a way that
|
|
||||||
generally allows linear memory access.) There are two benefits from
|
|
||||||
processing compact regions of the domain.
|
|
||||||
|
|
||||||
First, it's often the case that the control flow coherence of the program
|
|
||||||
instances in the gang is improved; if data-dependent control flow decisions
|
|
||||||
are related to the values of the data in the domain being processed, and if
|
|
||||||
the data values have some coherence, iterating with compact regions will
|
|
||||||
improve control flow coherence.
|
|
||||||
|
|
||||||
Second, processing compact regions may mean that the data accessed by
|
|
||||||
program instances in the gang is more coherent, leading to performance
|
|
||||||
benefits from better cache hit rates.
|
|
||||||
|
|
||||||
As a concrete example, for the ray tracer example in the ``ispc``
|
|
||||||
distribution (in the ``examples/rt`` directory), performance is 20% better
|
|
||||||
when the pixels are iterated over using ``foreach_tiled`` than ``foreach``,
|
|
||||||
because more coherent regions of the scene are accessed by the set of rays
|
|
||||||
in the gang of program instances.
|
|
||||||
|
|
||||||
|
|
||||||
Using Coherent Control Flow Constructs
|
|
||||||
--------------------------------------
|
|
||||||
|
|
||||||
Recall from the ``ispc`` Users Guide, in the `SPMD-on-SIMD Execution Model
|
|
||||||
section`_ that ``if`` statements with a ``uniform`` test compile to more
|
|
||||||
efficient code than ``if`` tests with varying tests. The coherent ``cif``
|
|
||||||
statement can provide many benefits of ``if`` with a uniform test in the
|
|
||||||
case where the test is actually varying.
|
|
||||||
|
|
||||||
.. _SPMD-on-SIMD Execution Model section: ispc.html#the-spmd-on-simd-execution-model
|
|
||||||
|
|
||||||
In this case, the code the compiler generates for the ``cif``
|
|
||||||
test is along the lines of the following pseudo-code:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
bool expr = /* evaluate cif condition */
|
|
||||||
if (all(expr)) {
|
|
||||||
// run "true" case of if test only
|
|
||||||
} else if (!any(expr)) {
|
|
||||||
// run "false" case of if test only
|
|
||||||
} else {
|
|
||||||
// run both true and false cases, updating mask appropriately
|
|
||||||
}
|
|
||||||
|
|
||||||
For ``if`` statements where the different running SPMD program instances
|
|
||||||
don't have coherent values for the boolean ``if`` test, using ``cif``
|
|
||||||
introduces some additional overhead from the ``all`` and ``any`` tests as
|
|
||||||
well as the corresponding branches. For cases where the program
|
|
||||||
instances often do compute the same boolean value, this overhead is
|
|
||||||
worthwhile. If the control flow is in fact usually incoherent, this
|
|
||||||
overhead only costs performance.
|
|
||||||
|
|
||||||
In a similar fashion, ``ispc`` provides ``cfor``, ``cwhile``, and ``cdo``
|
|
||||||
statements. These statements are semantically the same as the
|
|
||||||
corresponding non-"c"-prefixed statements.
|
|
||||||
|
|
||||||
Use "uniform" Whenever Appropriate
|
|
||||||
----------------------------------
|
|
||||||
|
|
||||||
For any variable that will always have the same value across all of the
|
|
||||||
program instances in a gang, declare the variable with the ``uniform``
|
|
||||||
qualifier. Doing so enables the ``ispc`` compiler to emit better code in
|
|
||||||
many different ways.
|
|
||||||
|
|
||||||
As a simple example, consider a ``for`` loop that always does the same
|
|
||||||
number of iterations:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
for (int i = 0; i < 10; ++i)
|
|
||||||
// do something ten times
|
|
||||||
|
|
||||||
If this is written with ``i`` as a ``varying`` variable, as above, there's
|
|
||||||
additional overhead in the code generated for the loop as the compiler
|
|
||||||
emits instructions to handle the possibility of not all program instances
|
|
||||||
following the same control flow path (as might be the case if the loop
|
|
||||||
limit, 10, was itself a ``varying`` value.)
|
|
||||||
|
|
||||||
If the above loop is instead written with ``i`` ``uniform``, as:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
for (uniform int i = 0; i < 10; ++i)
|
|
||||||
// do something ten times
|
|
||||||
|
|
||||||
Then better code can be generated (and the loop possibly unrolled).
|
|
||||||
|
|
||||||
In some cases, the compiler may be able to detect simple cases like these,
|
|
||||||
but it's always best to provide the compiler with as much help as possible
|
|
||||||
to understand the actual form of your computation.
|
|
||||||
|
|
||||||
|
|
||||||
Tips and Techniques
|
|
||||||
===================
|
|
||||||
|
|
||||||
This section introduces a number of additional techniques that are worth
|
|
||||||
keeping in mind when writing ``ispc`` programs.
|
|
||||||
|
|
||||||
Understanding Gather and Scatter
|
|
||||||
--------------------------------
|
|
||||||
|
|
||||||
Memory reads and writes from the program instances in a gang that access
|
|
||||||
irregular memory locations (rather than a consecutive set of locations, or
|
|
||||||
a single location) can be relatively inefficient. As an example, consider
|
|
||||||
the "simple" array indexing calculation below:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
int i = ....;
|
|
||||||
uniform float x[10] = { ... };
|
|
||||||
float f = x[i];
|
|
||||||
|
|
||||||
Since the index ``i`` is a varying value, the program instances in the gang
|
|
||||||
will in general be reading different locations in the array ``x``. Because
|
|
||||||
current CPUs don't have a "gather" instruction, the ``ispc`` compiler has to
|
|
||||||
serialize these memory reads, performing a separate memory load for each
|
|
||||||
running program instance, packing the result into ``f``. (The analogous
|
|
||||||
case happens for a write into ``x[i]``.)
|
|
||||||
|
|
||||||
In many cases, gathers like these are unavoidable; the program instances
|
|
||||||
just need to access incoherent memory locations. However, if the array
|
|
||||||
index ``i`` actually has the same value for all of the program instances or
|
|
||||||
if it represents an access to a consecutive set of array locations, much
|
|
||||||
more efficient load and store instructions can be generated instead of
|
|
||||||
gathers and scatters, respectively.
|
|
||||||
|
|
||||||
In many cases, the ``ispc`` compiler is able to deduce that the memory
|
|
||||||
locations accessed by a varying index are either all the same or are
|
|
||||||
uniform. For example, given:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
uniform int x = ...;
|
|
||||||
int y = x;
|
|
||||||
return array[y];
|
|
||||||
|
|
||||||
The compiler is able to determine that all of the program instances are
|
|
||||||
loading from the same location, even though ``y`` is not a ``uniform``
|
|
||||||
variable. In this case, the compiler will transform this load to a regular
|
|
||||||
vector load, rather than a general gather.
|
|
||||||
|
|
||||||
Sometimes the running program instances will access a linear sequence of
|
|
||||||
memory locations; this happens most frequently when array indexing is done
|
|
||||||
based on the built-in ``programIndex`` variable. In many of these cases,
|
|
||||||
the compiler is also able to detect this case and then do a vector load.
|
|
||||||
For example, given:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
for (int i = programIndex; i < count; i += programCount)
|
|
||||||
// process array[i];
|
|
||||||
|
|
||||||
Regular vector loads and stores are issued for accesses to ``array[i]``.
|
|
||||||
|
|
||||||
Both of these cases have been ones where the compiler is able to determine
|
|
||||||
statically that the index has the same value at compile-time. It's
|
|
||||||
often the case that this determination can't be made at compile time, but
|
|
||||||
this is often the case at run time. The ``reduce_equal()`` function from
|
|
||||||
the standard library can be used in this case; it checks to see if the
|
|
||||||
given value is the same across all of the running program instances,
|
|
||||||
returning true and its ``uniform`` value if so.
|
|
||||||
|
|
||||||
The following function shows the use of ``reduce_equal()`` to check for an
|
|
||||||
equal index at execution time and then either do a scalar load and
|
|
||||||
broadcast or a general gather.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
uniform float array[..] = { ... };
|
|
||||||
float value;
|
|
||||||
int i = ...;
|
|
||||||
uniform int ui;
|
|
||||||
if (reduce_equal(i, &ui) == true)
|
|
||||||
value = array[ui]; // scalar load + broadcast
|
|
||||||
else
|
|
||||||
value = array[i]; // gather
|
|
||||||
|
|
||||||
For a simple case like the one above, the overhead of doing the
|
|
||||||
``reduce_equal()`` check is likely not worthwhile compared to just always
|
|
||||||
doing a gather. In more complex cases, where a number of accesses are done
|
|
||||||
based on the index, it can be worth doing. See the example
|
|
||||||
``examples/volume_rendering`` in the ``ispc`` distribution for the use of
|
|
||||||
this technique in an instance where it is beneficial to performance.
|
|
||||||
|
|
||||||
Avoid 64-bit Addressing Calculations When Possible
|
|
||||||
--------------------------------------------------
|
|
||||||
|
|
||||||
Even when compiling to a 64-bit architecture target, ``ispc`` does many of
|
|
||||||
the addressing calculations in 32-bit precision by default--this behavior
|
|
||||||
can be overridden with the ``--addressing=64`` command-line argument. This
|
|
||||||
option should only be used if it's necessary to be able to address over 4GB
|
|
||||||
of memory in the ``ispc`` code, as it essentially doubles the cost of
|
|
||||||
memory addressing calculations in the generated code.
|
|
||||||
|
|
||||||
Avoid Computation With 8 and 16-bit Integer Types
|
|
||||||
-------------------------------------------------
|
|
||||||
|
|
||||||
The code generated for 8 and 16-bit integer types is generally not as
|
|
||||||
efficient as the code generated for 32-bit integer types. It is generally
|
|
||||||
worthwhile to use 32-bit integer types for intermediate computations, even
|
|
||||||
if the final result will be stored in a smaller integer type.
|
|
||||||
|
|
||||||
Implementing Reductions Efficiently
|
|
||||||
-----------------------------------
|
|
||||||
|
|
||||||
It's often necessary to compute a reduction over a data set--for example,
|
|
||||||
one might want to add all of the values in an array, compute their minimum,
|
|
||||||
etc. ``ispc`` provides a few capabilities that make it easy to efficiently
|
|
||||||
compute reductions like these. However, it's important to use these
|
|
||||||
capabilities appropriately for best results.
|
|
||||||
|
|
||||||
As an example, consider the task of computing the sum of all of the values
|
|
||||||
in an array. In C code, we might have:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
/* C implementation of a sum reduction */
|
|
||||||
float sum(const float array[], int count) {
|
|
||||||
float sum = 0;
|
|
||||||
for (int i = 0; i < count; ++i)
|
|
||||||
sum += array[i];
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
Exactly this computation could also be expressed as a purely uniform
|
|
||||||
computation in ``ispc``, though without any benefit from vectorization:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
/* inefficient ispc implementation of a sum reduction */
|
|
||||||
uniform float sum(const uniform float array[], uniform int count) {
|
|
||||||
uniform float sum = 0;
|
|
||||||
for (uniform int i = 0; i < count; ++i)
|
|
||||||
sum += array[i];
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
As a first try, one might try using the ``reduce_add()`` function from the
|
|
||||||
``ispc`` standard library; it takes a ``varying`` value and returns the sum
|
|
||||||
of that value across all of the active program instances.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
/* inefficient ispc implementation of a sum reduction */
|
|
||||||
uniform float sum(const uniform float array[], uniform int count) {
|
|
||||||
uniform float sum = 0;
|
|
||||||
foreach (i = 0 ... count)
|
|
||||||
sum += reduce_add(array[i+programIndex]);
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
This implementation loads a gang's worth of values from the array, one for
|
|
||||||
each of the program instances, and then uses ``reduce_add()`` to reduce
|
|
||||||
across the program instances and then update the sum. Unfortunately this
|
|
||||||
approach loses most benefit from vectorization, as it does more work on the
|
|
||||||
cross-program instance ``reduce_add()`` call than it saves from the vector
|
|
||||||
load of values.
|
|
||||||
|
|
||||||
The most efficient approach is to do the reduction in two phases: rather
|
|
||||||
than using a ``uniform`` variable to store the sum, we maintain a varying
|
|
||||||
value, such that each program instance is effectively computing a local
|
|
||||||
partial sum on the subset of array values that it has loaded from the
|
|
||||||
array. When the loop over array elements concludes, a single call to
|
|
||||||
``reduce_add()`` computes the final reduction across each of the program
|
|
||||||
instances' elements of ``sum``. This approach effectively compiles to a
|
|
||||||
single vector load and a single vector add for each loop iteration's worth of
|
|
||||||
values--very efficient code in the end.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
/* good ispc implementation of a sum reduction */
|
|
||||||
uniform float sum(const uniform float array[], uniform int count) {
|
|
||||||
float sum = 0;
|
|
||||||
foreach (i = 0 ... count)
|
|
||||||
sum += array[i+programIndex];
|
|
||||||
return reduce_add(sum);
|
|
||||||
}
|
|
||||||
|
|
||||||
Using Low-level Vector Tricks
|
|
||||||
-----------------------------
|
|
||||||
|
|
||||||
Many low-level Intel® SSE and AVX coding constructs can be implemented in
|
|
||||||
``ispc`` code. The ``ispc`` standard library functions ``intbits()`` and
|
|
||||||
``floatbits()`` are often useful in this context. Recall that
|
|
||||||
``intbits()`` takes a ``float`` value and returns it as an integer where
|
|
||||||
the bits of the integer are the same as the bit representation in memory of
|
|
||||||
the ``float``. (In other words, it does *not* perform an integer to
|
|
||||||
floating-point conversion.) ``floatbits()``, then, performs the inverse
|
|
||||||
computation.
|
|
||||||
|
|
||||||
As an example of the use of these functions, the following code efficiently
|
|
||||||
reverses the sign of the given values.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
float flipsign(float a) {
|
|
||||||
unsigned int i = intbits(a);
|
|
||||||
i ^= 0x80000000;
|
|
||||||
return floatbits(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
This code compiles down to a single XOR instruction.
|
|
||||||
|
|
||||||
The "Fast math" Option
|
|
||||||
----------------------
|
|
||||||
|
|
||||||
``ispc`` has a ``--opt=fast-math`` command-line flag that enables a number of
|
|
||||||
optimizations that may be undesirable in code where numerical precision is
|
|
||||||
critically important. For many graphics applications, for example, the
|
|
||||||
approximations introduced may be acceptable, however. The following two
|
|
||||||
optimizations are performed when ``--opt=fast-math`` is used. By default, the
|
|
||||||
``--opt=fast-math`` flag is off.
|
|
||||||
|
|
||||||
* Expressions like ``x / y``, where ``y`` is a compile-time constant, are
|
|
||||||
transformed to ``x * (1./y)``, where the inverse value of ``y`` is
|
|
||||||
precomputed at compile time.
|
|
||||||
|
|
||||||
* Expressions like ``x / y``, where ``y`` is not a compile-time constant,
|
|
||||||
are transformed to ``x * rcp(y)``, where ``rcp()`` maps to the
|
|
||||||
approximate reciprocal instruction from the ``ispc`` standard library.
|
|
||||||
|
|
||||||
|
|
||||||
"inline" Aggressively
|
|
||||||
---------------------
|
|
||||||
|
|
||||||
Inlining functions aggressively is generally beneficial for performance
|
|
||||||
with ``ispc``. Definitely use the ``inline`` qualifier for any short
|
|
||||||
functions (a few lines long), and experiment with it for longer functions.
|
|
||||||
|
|
||||||
Avoid The System Math Library
|
|
||||||
-----------------------------
|
|
||||||
|
|
||||||
The default math library for transcendentals and the like that ``ispc`` uses has
|
|
||||||
higher error than the system's math library, though it is much more efficient
|
|
||||||
due to being vectorized across the program instances and due to the fact
|
|
||||||
that the functions can be inlined in the final code. (It generally has
|
|
||||||
errors in the range of 10ulps, while the system math library generally has
|
|
||||||
no more than 1ulp of error for transcendentals.)
|
|
||||||
|
|
||||||
If the ``--math-lib=system`` command-line option is used when compiling an
|
|
||||||
``ispc`` program, then calls to the system math library will be generated
|
|
||||||
instead. This option should only be used if the higher precision is
|
|
||||||
absolutely required as the performance impact of using it can be
|
|
||||||
significant.
|
|
||||||
|
|
||||||
Declare Variables In The Scope Where They're Used
|
|
||||||
-------------------------------------------------
|
|
||||||
|
|
||||||
Performance is slightly improved by declaring variables at the same block
|
|
||||||
scope where they are first used. For example, in code like the
|
|
||||||
following, if the lifetime of ``foo`` is only within the scope of the
|
|
||||||
``if`` clause, write the code like this:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
float func() {
|
|
||||||
....
|
|
||||||
if (x < y) {
|
|
||||||
float foo;
|
|
||||||
... use foo ...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Try not to write code as:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
float func() {
|
|
||||||
float foo;
|
|
||||||
....
|
|
||||||
if (x < y) {
|
|
||||||
... use foo ...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Doing so can reduce the amount of masked store instructions that the
|
|
||||||
compiler needs to generate.
|
|
||||||
|
|
||||||
Instrumenting ISPC Programs To Understand Runtime Behavior
|
|
||||||
----------------------------------------------------------
|
|
||||||
|
|
||||||
``ispc`` has an optional instrumentation feature that can help you
|
|
||||||
understand performance issues. If a program is compiled using the
|
|
||||||
``--instrument`` flag, the compiler emits calls to a function with the
|
|
||||||
following signature at various points in the program (for
|
|
||||||
example, at interesting points in the control flow, when scatters or
|
|
||||||
gathers happen.)
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
extern "C" {
|
|
||||||
void ISPCInstrument(const char *fn, const char *note,
|
|
||||||
int line, int mask);
|
|
||||||
}
|
|
||||||
|
|
||||||
This function is passed the file name of the ``ispc`` file running, a short
|
|
||||||
note indicating what is happening, the line number in the source file, and
|
|
||||||
the current mask of active program instances in the gang. You must provide an
|
|
||||||
implementation of this function and link it in with your application.
|
|
||||||
|
|
||||||
For example, when the ``ispc`` program runs, this function might be called
|
|
||||||
as follows:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
ISPCInstrument("foo.ispc", "function entry", 55, 0xf);
|
|
||||||
|
|
||||||
This call indicates that the currently executing program has just
|
|
||||||
entered the function defined at line 55 of the file ``foo.ispc``, with a
|
|
||||||
mask of all lanes currently executing (assuming a four-wide gang size
|
|
||||||
target machine).
|
|
||||||
|
|
||||||
For a fuller example of the utility of this functionality, see
|
|
||||||
``examples/aobench_instrumented`` in the ``ispc`` distribution.  This
|
|
||||||
example includes an implementation of the ``ISPCInstrument()`` function
|
|
||||||
that collects aggregate data about the program's execution behavior.
|
|
||||||
|
|
||||||
When running this example, you will want to direct the ``ao`` executable
|
|
||||||
to generate a low resolution image, because the instrumentation adds
|
|
||||||
substantial execution overhead. For example:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
% ./ao 1 32 32
|
|
||||||
|
|
||||||
After the ``ao`` program exits, a summary report along the following lines
|
|
||||||
will be printed. In the first few lines, you can see how many times a few
|
|
||||||
functions were called, and the average percentage of SIMD lanes that were
|
|
||||||
active upon function entry.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
ao.ispc(0067) - function entry: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
|
|
||||||
ao.ispc(0067) - return: uniform control flow: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
|
|
||||||
ao.ispc(0071) - function entry: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
|
|
||||||
ao.ispc(0075) - return: uniform control flow: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
|
|
||||||
ao.ispc(0079) - function entry: 10072 calls (0 / 0.00% all off!), 45.09% active lanes
|
|
||||||
ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes
|
|
||||||
...
|
|
||||||
|
|
||||||
|
|
||||||
Choosing A Target Vector Width
|
|
||||||
------------------------------
|
|
||||||
|
|
||||||
By default, ``ispc`` compiles to the natural vector width of the target
|
|
||||||
instruction set. For example, for SSE2 and SSE4, it compiles four-wide,
|
|
||||||
and for AVX, it compiles 8-wide.  For some programs, higher performance may
|
|
||||||
be seen if the program is compiled to a doubled vector width--8-wide for
|
|
||||||
SSE and 16-wide for AVX.
|
|
||||||
|
|
||||||
For workloads that don't require many registers, this method can lead to
|
|
||||||
significantly more efficient execution thanks to greater instruction level
|
|
||||||
parallelism and amortization of various overhead over more program
|
|
||||||
instances. For other workloads, it may lead to a slowdown due to higher
|
|
||||||
register pressure; trying both approaches for key kernels may be
|
|
||||||
worthwhile.
|
|
||||||
|
|
||||||
This option is only available for each of the SSE2, SSE4 and AVX targets.
|
|
||||||
It is selected with the ``--target=sse2-x2``, ``--target=sse4-x2`` and
|
|
||||||
``--target=avx-x2`` options, respectively.
|
|
||||||
|
|
||||||
|
|
||||||
Disclaimer and Legal Information
|
|
||||||
================================
|
|
||||||
|
|
||||||
INFORMATION IN THIS DOCUMENT IS PROVIDED IN CONNECTION WITH INTEL(R) PRODUCTS.
|
|
||||||
NO LICENSE, EXPRESS OR IMPLIED, BY ESTOPPEL OR OTHERWISE, TO ANY INTELLECTUAL
|
|
||||||
PROPERTY RIGHTS IS GRANTED BY THIS DOCUMENT. EXCEPT AS PROVIDED IN INTEL'S TERMS
|
|
||||||
AND CONDITIONS OF SALE FOR SUCH PRODUCTS, INTEL ASSUMES NO LIABILITY WHATSOEVER,
|
|
||||||
AND INTEL DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY, RELATING TO SALE AND/OR USE
|
|
||||||
OF INTEL PRODUCTS INCLUDING LIABILITY OR WARRANTIES RELATING TO FITNESS FOR A
|
|
||||||
PARTICULAR PURPOSE, MERCHANTABILITY, OR INFRINGEMENT OF ANY PATENT, COPYRIGHT
|
|
||||||
OR OTHER INTELLECTUAL PROPERTY RIGHT.
|
|
||||||
|
|
||||||
UNLESS OTHERWISE AGREED IN WRITING BY INTEL, THE INTEL PRODUCTS ARE NOT DESIGNED
|
|
||||||
NOR INTENDED FOR ANY APPLICATION IN WHICH THE FAILURE OF THE INTEL PRODUCT COULD
|
|
||||||
CREATE A SITUATION WHERE PERSONAL INJURY OR DEATH MAY OCCUR.
|
|
||||||
|
|
||||||
Intel may make changes to specifications and product descriptions at any time,
|
|
||||||
without notice. Designers must not rely on the absence or characteristics of any
|
|
||||||
features or instructions marked "reserved" or "undefined." Intel reserves these
|
|
||||||
for future definition and shall have no responsibility whatsoever for conflicts
|
|
||||||
or incompatibilities arising from future changes to them. The information here
|
|
||||||
is subject to change without notice. Do not finalize a design with this
|
|
||||||
information.
|
|
||||||
|
|
||||||
The products described in this document may contain design defects or errors
|
|
||||||
known as errata which may cause the product to deviate from published
|
|
||||||
specifications. Current characterized errata are available on request.
|
|
||||||
|
|
||||||
Contact your local Intel sales office or your distributor to obtain the latest
|
|
||||||
specifications and before placing your product order.
|
|
||||||
|
|
||||||
Copies of documents which have an order number and are referenced in this
|
|
||||||
document, or other Intel literature, may be obtained by calling 1-800-548-4725,
|
|
||||||
or by visiting Intel's Web Site.
|
|
||||||
|
|
||||||
Intel processor numbers are not a measure of performance. Processor numbers
|
|
||||||
differentiate features within each processor family, not across different
|
|
||||||
processor families. See http://www.intel.com/products/processor_number for
|
|
||||||
details.
|
|
||||||
|
|
||||||
BunnyPeople, Celeron, Celeron Inside, Centrino, Centrino Atom,
|
|
||||||
Centrino Atom Inside, Centrino Inside, Centrino logo, Core Inside, FlashFile,
|
|
||||||
i960, InstantIP, Intel, Intel logo, Intel386, Intel486, IntelDX2, IntelDX4,
|
|
||||||
IntelSX2, Intel Atom, Intel Atom Inside, Intel Core, Intel Inside,
|
|
||||||
Intel Inside logo, Intel. Leap ahead., Intel. Leap ahead. logo, Intel NetBurst,
|
|
||||||
Intel NetMerge, Intel NetStructure, Intel SingleDriver, Intel SpeedStep,
|
|
||||||
Intel StrataFlash, Intel Viiv, Intel vPro, Intel XScale, Itanium,
|
|
||||||
Itanium Inside, MCS, MMX, Oplus, OverDrive, PDCharm, Pentium, Pentium Inside,
|
|
||||||
skoool, Sound Mark, The Journey Inside, Viiv Inside, vPro Inside, VTune, Xeon,
|
|
||||||
and Xeon Inside are trademarks of Intel Corporation in the U.S. and other
|
|
||||||
countries.
|
|
||||||
|
|
||||||
* Other names and brands may be claimed as the property of others.
|
|
||||||
|
|
||||||
Copyright(C) 2011, Intel Corporation. All rights reserved.
|
|
||||||
|
|
||||||
|
|
||||||
Optimization Notice
|
|
||||||
===================
|
|
||||||
|
|
||||||
Intel compilers, associated libraries and associated development tools may
|
|
||||||
include or utilize options that optimize for instruction sets that are
|
|
||||||
available in both Intel and non-Intel microprocessors (for example SIMD
|
|
||||||
instruction sets), but do not optimize equally for non-Intel
|
|
||||||
microprocessors. In addition, certain compiler options for Intel
|
|
||||||
compilers, including some that are not specific to Intel
|
|
||||||
micro-architecture, are reserved for Intel microprocessors. For a detailed
|
|
||||||
description of Intel compiler options, including the instruction sets and
|
|
||||||
specific microprocessors they implicate, please refer to the "Intel
|
|
||||||
Compiler User and Reference Guides" under "Compiler Options." Many library
|
|
||||||
routines that are part of Intel compiler products are more highly optimized
|
|
||||||
for Intel microprocessors than for other microprocessors. While the
|
|
||||||
compilers and libraries in Intel compiler products offer optimizations for
|
|
||||||
both Intel and Intel-compatible microprocessors, depending on the options
|
|
||||||
you select, your code and other factors, you likely will get extra
|
|
||||||
performance on Intel microprocessors.
|
|
||||||
|
|
||||||
Intel compilers, associated libraries and associated development tools may
|
|
||||||
or may not optimize to the same degree for non-Intel microprocessors for
|
|
||||||
optimizations that are not unique to Intel microprocessors. These
|
|
||||||
optimizations include Intel® Streaming SIMD Extensions 2 (Intel® SSE2),
|
|
||||||
Intel® Streaming SIMD Extensions 3 (Intel® SSE3), and Supplemental
|
|
||||||
Streaming SIMD Extensions 3 (Intel SSSE3) instruction sets and other
|
|
||||||
optimizations. Intel does not guarantee the availability, functionality,
|
|
||||||
or effectiveness of any optimization on microprocessors not manufactured by
|
|
||||||
Intel. Microprocessor-dependent optimizations in this product are intended
|
|
||||||
for use with Intel microprocessors.
|
|
||||||
|
|
||||||
While Intel believes our compilers and libraries are excellent choices to
|
|
||||||
assist in obtaining the best performance on Intel and non-Intel
|
|
||||||
microprocessors, Intel recommends that you evaluate other compilers and
|
|
||||||
libraries to determine which best meet your requirements. We hope to win
|
|
||||||
your business by striving to offer the best performance of any compiler or
|
|
||||||
library; please let us know if you find we do not.
|
|
||||||
|
|
||||||
@@ -1,65 +0,0 @@
|
|||||||
%(head_prefix)s
|
|
||||||
%(head)s
|
|
||||||
<script type="text/javascript">
|
|
||||||
|
|
||||||
var _gaq = _gaq || [];
|
|
||||||
_gaq.push(['_setAccount', 'UA-1486404-4']);
|
|
||||||
_gaq.push(['_trackPageview']);
|
|
||||||
|
|
||||||
(function() {
|
|
||||||
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
|
||||||
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
|
||||||
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
|
||||||
})();
|
|
||||||
|
|
||||||
</script>
|
|
||||||
%(stylesheet)s
|
|
||||||
%(body_prefix)s
|
|
||||||
<div id="wrap">
|
|
||||||
<div id="wrap2">
|
|
||||||
<div id="header">
|
|
||||||
<h1 id="logo">Intel SPMD Program Compiler</h1>
|
|
||||||
<div id="slogan">An open-source compiler for high-performance SIMD programming on
|
|
||||||
the CPU</div>
|
|
||||||
</div>
|
|
||||||
<div id="nav">
|
|
||||||
<div id="nbar">
|
|
||||||
<ul>
|
|
||||||
<li><a href="index.html">Overview</a></li>
|
|
||||||
<li><a href="features.html">Features</a></li>
|
|
||||||
<li><a href="downloads.html">Downloads</a></li>
|
|
||||||
<li><a href="documentation.html">Documentation</a></li>
|
|
||||||
<li id="selected"><a href="perf.html">Performance</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
<div id="content-wrap">
|
|
||||||
<div id="sidebar">
|
|
||||||
<div class="widgetspace">
|
|
||||||
<h1>Resources</h1>
|
|
||||||
<ul class="menu">
|
|
||||||
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
|
|
||||||
<li><a href="http://groups.google.com/group/ispc-users/">ispc
|
|
||||||
users mailing list</a></li>
|
|
||||||
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
|
|
||||||
developers mailing list</a></li>
|
|
||||||
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
|
|
||||||
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
|
|
||||||
<li><a href="doxygen/index.html">Doxygen documentation of
|
|
||||||
<tt>ispc</tt> source code</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
%(body_pre_docinfo)s
|
|
||||||
%(docinfo)s
|
|
||||||
<div id="content">
|
|
||||||
%(body)s
|
|
||||||
</div>
|
|
||||||
<div class="clearfix"></div>
|
|
||||||
<div id="footer"> © 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
|
||||||
<!-- Please Do Not remove this link, thank u -->
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
%(body_suffix)s
|
|
||||||
@@ -1,65 +0,0 @@
|
|||||||
%(head_prefix)s
|
|
||||||
%(head)s
|
|
||||||
<script type="text/javascript">
|
|
||||||
|
|
||||||
var _gaq = _gaq || [];
|
|
||||||
_gaq.push(['_setAccount', 'UA-1486404-4']);
|
|
||||||
_gaq.push(['_trackPageview']);
|
|
||||||
|
|
||||||
(function() {
|
|
||||||
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
|
||||||
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
|
||||||
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
|
||||||
})();
|
|
||||||
|
|
||||||
</script>
|
|
||||||
%(stylesheet)s
|
|
||||||
%(body_prefix)s
|
|
||||||
<div id="wrap">
|
|
||||||
<div id="wrap2">
|
|
||||||
<div id="header">
|
|
||||||
<h1 id="logo">Intel SPMD Program Compiler</h1>
|
|
||||||
<div id="slogan">An open-source compiler for high-performance SIMD programming on
|
|
||||||
the CPU</div>
|
|
||||||
</div>
|
|
||||||
<div id="nav">
|
|
||||||
<div id="nbar">
|
|
||||||
<ul>
|
|
||||||
<li><a href="index.html">Overview</a></li>
|
|
||||||
<li><a href="features.html">Features</a></li>
|
|
||||||
<li><a href="downloads.html">Downloads</a></li>
|
|
||||||
<li id="selected"><a href="documentation.html">Documentation</a></li>
|
|
||||||
<li><a href="perf.html">Performance</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
<div id="content-wrap">
|
|
||||||
<div id="sidebar">
|
|
||||||
<div class="widgetspace">
|
|
||||||
<h1>Resources</h1>
|
|
||||||
<ul class="menu">
|
|
||||||
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
|
|
||||||
<li><a href="http://groups.google.com/group/ispc-users/">ispc
|
|
||||||
users mailing list</a></li>
|
|
||||||
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
|
|
||||||
developers mailing list</a></li>
|
|
||||||
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
|
|
||||||
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
|
|
||||||
<li><a href="doxygen/index.html">Doxygen documentation of
|
|
||||||
<tt>ispc</tt> source code</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
%(body_pre_docinfo)s
|
|
||||||
%(docinfo)s
|
|
||||||
<div id="content">
|
|
||||||
%(body)s
|
|
||||||
</div>
|
|
||||||
<div class="clearfix"></div>
|
|
||||||
<div id="footer"> © 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
|
||||||
<!-- Please Do Not remove this link, thank u -->
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
%(body_suffix)s
|
|
||||||
@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
|
|||||||
# This could be handy for archiving the generated documentation or
|
# This could be handy for archiving the generated documentation or
|
||||||
# if some version control system is used.
|
# if some version control system is used.
|
||||||
|
|
||||||
PROJECT_NUMBER = 1.1.0
|
PROJECT_NUMBER = 1.0.10
|
||||||
|
|
||||||
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
|
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
|
||||||
# base path where the generated documentation will be put.
|
# base path where the generated documentation will be put.
|
||||||
@@ -585,6 +585,7 @@ INPUT = builtins.h \
|
|||||||
ctx.h \
|
ctx.h \
|
||||||
decl.h \
|
decl.h \
|
||||||
expr.h \
|
expr.h \
|
||||||
|
gatherbuf.h \
|
||||||
ispc.h \
|
ispc.h \
|
||||||
llvmutil.h \
|
llvmutil.h \
|
||||||
module.h \
|
module.h \
|
||||||
@@ -597,6 +598,7 @@ INPUT = builtins.h \
|
|||||||
ctx.cpp \
|
ctx.cpp \
|
||||||
decl.cpp \
|
decl.cpp \
|
||||||
expr.cpp \
|
expr.cpp \
|
||||||
|
gatherbuf.cpp \
|
||||||
ispc.cpp \
|
ispc.cpp \
|
||||||
llvmutil.cpp \
|
llvmutil.cpp \
|
||||||
main.cpp \
|
main.cpp \
|
||||||
|
|||||||
@@ -8,11 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
|||||||
CXX=g++
|
CXX=g++
|
||||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O2 --target=sse2,sse4,avx --arch=x86-64
|
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
|
||||||
|
|
||||||
ISPC_OBJS=objs/ao_ispc.o objs/ao_ispc_sse2.o objs/ao_ispc_sse4.o \
|
|
||||||
objs/ao_ispc_avx.o
|
|
||||||
OBJS=objs/ao.o objs/ao_serial.o $(ISPC_OBJS) $(TASK_OBJ)
|
|
||||||
|
|
||||||
default: ao
|
default: ao
|
||||||
|
|
||||||
@@ -24,8 +20,8 @@ dirs:
|
|||||||
clean:
|
clean:
|
||||||
/bin/rm -rf objs *~ ao
|
/bin/rm -rf objs *~ ao
|
||||||
|
|
||||||
ao: dirs $(OBJS) $(TASK_OBJ)
|
ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o $(TASK_OBJ)
|
||||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||||
|
|
||||||
objs/%.o: %.cpp
|
objs/%.o: %.cpp
|
||||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||||
@@ -35,5 +31,5 @@ objs/%.o: ../%.cpp
|
|||||||
|
|
||||||
objs/ao.o: objs/ao_ispc.h
|
objs/ao.o: objs/ao_ispc.h
|
||||||
|
|
||||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||||
|
|||||||
@@ -55,6 +55,7 @@
|
|||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
|
#include "../cpuid.h"
|
||||||
|
|
||||||
#define NSUBSAMPLES 2
|
#define NSUBSAMPLES 2
|
||||||
|
|
||||||
@@ -104,6 +105,38 @@ savePPM(const char *fname, int w, int h)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Make sure that the vector ISA used during compilation is supported by
|
||||||
|
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||||
|
// header file that we include above.
|
||||||
|
static void
|
||||||
|
ensureTargetISAIsSupported() {
|
||||||
|
#if defined(ISPC_TARGET_SSE2)
|
||||||
|
bool isaSupported = CPUSupportsSSE2();
|
||||||
|
const char *target = "SSE2";
|
||||||
|
#elif defined(ISPC_TARGET_SSE4)
|
||||||
|
bool isaSupported = CPUSupportsSSE4();
|
||||||
|
const char *target = "SSE4";
|
||||||
|
#elif defined(ISPC_TARGET_AVX)
|
||||||
|
bool isaSupported = CPUSupportsAVX();
|
||||||
|
const char *target = "AVX";
|
||||||
|
#else
|
||||||
|
#error "Unknown ISPC_TARGET_* value"
|
||||||
|
#endif
|
||||||
|
if (!isaSupported) {
|
||||||
|
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||||
|
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||||
|
fprintf(stderr, "***\n*** Please modify the "
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
"MSVC project file "
|
||||||
|
#else
|
||||||
|
"Makefile "
|
||||||
|
#endif
|
||||||
|
"to select another target (e.g. sse2)\n***\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
if (argc != 4) {
|
if (argc != 4) {
|
||||||
@@ -118,6 +151,8 @@ int main(int argc, char **argv)
|
|||||||
height = atoi (argv[3]);
|
height = atoi (argv[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ensureTargetISAIsSupported();
|
||||||
|
|
||||||
// Allocate space for output images
|
// Allocate space for output images
|
||||||
img = new unsigned char[width * height * 3];
|
img = new unsigned char[width * height * 3];
|
||||||
fimg = new float[width * height * 3];
|
fimg = new float[width * height * 3];
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ static inline vec vcross(vec v0, vec v1) {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void vnormalize(vec &v) {
|
static inline void vnormalize(reference vec v) {
|
||||||
float len2 = dot(v, v);
|
float len2 = dot(v, v);
|
||||||
float invlen = rsqrt(len2);
|
float invlen = rsqrt(len2);
|
||||||
v *= invlen;
|
v *= invlen;
|
||||||
@@ -83,7 +83,8 @@ static inline void vnormalize(vec &v) {
|
|||||||
|
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
ray_plane_intersect(reference Isect isect, reference Ray ray,
|
||||||
|
reference Plane plane) {
|
||||||
float d = -dot(plane.p, plane.n);
|
float d = -dot(plane.p, plane.n);
|
||||||
float v = dot(ray.dir, plane.n);
|
float v = dot(ray.dir, plane.n);
|
||||||
|
|
||||||
@@ -103,7 +104,8 @@ ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
|||||||
|
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
|
ray_sphere_intersect(reference Isect isect, reference Ray ray,
|
||||||
|
reference Sphere sphere) {
|
||||||
vec rs = ray.org - sphere.center;
|
vec rs = ray.org - sphere.center;
|
||||||
|
|
||||||
float B = dot(rs, ray.dir);
|
float B = dot(rs, ray.dir);
|
||||||
@@ -125,7 +127,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
|
|||||||
|
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
orthoBasis(vec basis[3], vec n) {
|
orthoBasis(reference vec basis[3], vec n) {
|
||||||
basis[2] = n;
|
basis[2] = n;
|
||||||
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
||||||
|
|
||||||
@@ -148,8 +150,8 @@ orthoBasis(vec basis[3], vec n) {
|
|||||||
|
|
||||||
|
|
||||||
static inline float
|
static inline float
|
||||||
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
ambient_occlusion(reference Isect isect, reference Plane plane,
|
||||||
RNGState &rngstate) {
|
reference Sphere spheres[3], reference RNGState rngstate) {
|
||||||
float eps = 0.0001f;
|
float eps = 0.0001f;
|
||||||
vec p, n;
|
vec p, n;
|
||||||
vec basis[3];
|
vec basis[3];
|
||||||
@@ -166,8 +168,8 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
|||||||
Ray ray;
|
Ray ray;
|
||||||
Isect occIsect;
|
Isect occIsect;
|
||||||
|
|
||||||
float theta = sqrt(frandom(&rngstate));
|
float theta = sqrt(frandom(rngstate));
|
||||||
float phi = 2.0f * M_PI * frandom(&rngstate);
|
float phi = 2.0f * M_PI * frandom(rngstate);
|
||||||
float x = cos(phi) * theta;
|
float x = cos(phi) * theta;
|
||||||
float y = sin(phi) * theta;
|
float y = sin(phi) * theta;
|
||||||
float z = sqrt(1.0 - theta * theta);
|
float z = sqrt(1.0 - theta * theta);
|
||||||
@@ -203,7 +205,7 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
|||||||
*/
|
*/
|
||||||
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||||
uniform int h, uniform int nsubsamples,
|
uniform int h, uniform int nsubsamples,
|
||||||
uniform float image[]) {
|
reference uniform float image[]) {
|
||||||
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||||
static Sphere spheres[3] = {
|
static Sphere spheres[3] = {
|
||||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||||
@@ -211,7 +213,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
|||||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||||
RNGState rngstate;
|
RNGState rngstate;
|
||||||
|
|
||||||
seed_rng(&rngstate, y0);
|
seed_rng(rngstate, y0);
|
||||||
|
|
||||||
// Compute the mapping between the 'programCount'-wide program
|
// Compute the mapping between the 'programCount'-wide program
|
||||||
// instances running in parallel and samples in the image.
|
// instances running in parallel and samples in the image.
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
<ItemGroup Label="ProjectConfigurations">
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
<ProjectConfiguration Include="Debug|Win32">
|
<ProjectConfiguration Include="Debug|Win32">
|
||||||
@@ -26,18 +26,18 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="ao.ispc">
|
<CustomBuild Include="ao.ispc">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<PropertyGroup Label="Globals">
|
<PropertyGroup Label="Globals">
|
||||||
@@ -86,19 +86,15 @@
|
|||||||
<PropertyGroup Label="UserMacros" />
|
<PropertyGroup Label="UserMacros" />
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ExecutablePath);$(ProjectDir)..\..</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<ClCompile>
|
<ClCompile>
|
||||||
@@ -107,7 +103,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -123,7 +118,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -141,7 +135,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -160,7 +153,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -173,4 +165,4 @@
|
|||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
CXX=g++ -m64
|
CXX=g++ -m64
|
||||||
CXXFLAGS=-Iobjs/ -g3 -Wall
|
CXXFLAGS=-Iobjs/ -g3 -Wall
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O2 --instrument --arch=x86-64 --target=sse2
|
ISPCFLAGS=-O2 --instrument --arch=x86-64
|
||||||
|
|
||||||
default: ao
|
default: ao
|
||||||
|
|
||||||
|
|||||||
@@ -32,6 +32,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
|
#define _CRT_SECURE_NO_WARNINGS
|
||||||
#define NOMINMAX
|
#define NOMINMAX
|
||||||
#pragma warning (disable: 4244)
|
#pragma warning (disable: 4244)
|
||||||
#pragma warning (disable: 4305)
|
#pragma warning (disable: 4305)
|
||||||
@@ -50,11 +51,12 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
|
|
||||||
#include "ao_instrumented_ispc.h"
|
#include "ao_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
#include "instrument.h"
|
#include "instrument.h"
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
|
#include "../cpuid.h"
|
||||||
|
|
||||||
#define NSUBSAMPLES 2
|
#define NSUBSAMPLES 2
|
||||||
|
|
||||||
@@ -102,6 +104,37 @@ savePPM(const char *fname, int w, int h)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Make sure that the vector ISA used during compilation is supported by
|
||||||
|
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||||
|
// header file that we include above.
|
||||||
|
static void
|
||||||
|
ensureTargetISAIsSupported() {
|
||||||
|
#if defined(ISPC_TARGET_SSE2)
|
||||||
|
bool isaSupported = CPUSupportsSSE2();
|
||||||
|
const char *target = "SSE2";
|
||||||
|
#elif defined(ISPC_TARGET_SSE4)
|
||||||
|
bool isaSupported = CPUSupportsSSE4();
|
||||||
|
const char *target = "SSE4";
|
||||||
|
#elif defined(ISPC_TARGET_AVX)
|
||||||
|
bool isaSupported = CPUSupportsAVX();
|
||||||
|
const char *target = "AVX";
|
||||||
|
#else
|
||||||
|
#error "Unknown ISPC_TARGET_* value"
|
||||||
|
#endif
|
||||||
|
if (!isaSupported) {
|
||||||
|
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||||
|
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||||
|
fprintf(stderr, "***\n*** Please modify the "
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
"MSVC project file "
|
||||||
|
#else
|
||||||
|
"Makefile "
|
||||||
|
#endif
|
||||||
|
"to select another target (e.g. sse2)\n***\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
@@ -117,6 +150,8 @@ int main(int argc, char **argv)
|
|||||||
height = atoi (argv[3]);
|
height = atoi (argv[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ensureTargetISAIsSupported();
|
||||||
|
|
||||||
// Allocate space for output images
|
// Allocate space for output images
|
||||||
img = new unsigned char[width * height * 3];
|
img = new unsigned char[width * height * 3];
|
||||||
fimg = new float[width * height * 3];
|
fimg = new float[width * height * 3];
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ static inline vec vcross(vec v0, vec v1) {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void vnormalize(vec &v) {
|
static inline void vnormalize(reference vec v) {
|
||||||
float len2 = dot(v, v);
|
float len2 = dot(v, v);
|
||||||
float invlen = rsqrt(len2);
|
float invlen = rsqrt(len2);
|
||||||
v *= invlen;
|
v *= invlen;
|
||||||
@@ -83,7 +83,8 @@ static inline void vnormalize(vec &v) {
|
|||||||
|
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
ray_plane_intersect(reference Isect isect, reference Ray ray,
|
||||||
|
reference Plane plane) {
|
||||||
float d = -dot(plane.p, plane.n);
|
float d = -dot(plane.p, plane.n);
|
||||||
float v = dot(ray.dir, plane.n);
|
float v = dot(ray.dir, plane.n);
|
||||||
|
|
||||||
@@ -103,7 +104,8 @@ ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
|||||||
|
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
|
ray_sphere_intersect(reference Isect isect, reference Ray ray,
|
||||||
|
reference Sphere sphere) {
|
||||||
vec rs = ray.org - sphere.center;
|
vec rs = ray.org - sphere.center;
|
||||||
|
|
||||||
float B = dot(rs, ray.dir);
|
float B = dot(rs, ray.dir);
|
||||||
@@ -125,7 +127,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
|
|||||||
|
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
orthoBasis(vec basis[3], vec n) {
|
orthoBasis(reference vec basis[3], vec n) {
|
||||||
basis[2] = n;
|
basis[2] = n;
|
||||||
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
||||||
|
|
||||||
@@ -148,8 +150,8 @@ orthoBasis(vec basis[3], vec n) {
|
|||||||
|
|
||||||
|
|
||||||
static inline float
|
static inline float
|
||||||
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
ambient_occlusion(reference Isect isect, reference Plane plane,
|
||||||
RNGState &rngstate) {
|
reference Sphere spheres[3], reference RNGState rngstate) {
|
||||||
float eps = 0.0001f;
|
float eps = 0.0001f;
|
||||||
vec p, n;
|
vec p, n;
|
||||||
vec basis[3];
|
vec basis[3];
|
||||||
@@ -166,8 +168,8 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
|||||||
Ray ray;
|
Ray ray;
|
||||||
Isect occIsect;
|
Isect occIsect;
|
||||||
|
|
||||||
float theta = sqrt(frandom(&rngstate));
|
float theta = sqrt(frandom(rngstate));
|
||||||
float phi = 2.0f * M_PI * frandom(&rngstate);
|
float phi = 2.0f * M_PI * frandom(rngstate);
|
||||||
float x = cos(phi) * theta;
|
float x = cos(phi) * theta;
|
||||||
float y = sin(phi) * theta;
|
float y = sin(phi) * theta;
|
||||||
float z = sqrt(1.0 - theta * theta);
|
float z = sqrt(1.0 - theta * theta);
|
||||||
@@ -201,9 +203,8 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
|||||||
/* Compute the image for the scanlines from [y0,y1), for an overall image
|
/* Compute the image for the scanlines from [y0,y1), for an overall image
|
||||||
of width w and height h.
|
of width w and height h.
|
||||||
*/
|
*/
|
||||||
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||||
uniform int h, uniform int nsubsamples,
|
uniform int nsubsamples, reference uniform float image[]) {
|
||||||
uniform float image[]) {
|
|
||||||
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||||
static Sphere spheres[3] = {
|
static Sphere spheres[3] = {
|
||||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||||
@@ -211,7 +212,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
|||||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||||
RNGState rngstate;
|
RNGState rngstate;
|
||||||
|
|
||||||
seed_rng(&rngstate, y0);
|
seed_rng(rngstate, y0);
|
||||||
|
|
||||||
// Compute the mapping between the 'programCount'-wide program
|
// Compute the mapping between the 'programCount'-wide program
|
||||||
// instances running in parallel and samples in the image.
|
// instances running in parallel and samples in the image.
|
||||||
@@ -230,9 +231,6 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
|||||||
// direction we do per iteration and ny the number in y.
|
// direction we do per iteration and ny the number in y.
|
||||||
uniform int nx = 1, ny = 1;
|
uniform int nx = 1, ny = 1;
|
||||||
|
|
||||||
// FIXME: We actually need ny to be 1 regardless of the decomposition,
|
|
||||||
// since the task decomposition is one scanline high.
|
|
||||||
|
|
||||||
if (programCount == 8) {
|
if (programCount == 8) {
|
||||||
// Do two pixels at once in the x direction
|
// Do two pixels at once in the x direction
|
||||||
nx = 2;
|
nx = 2;
|
||||||
@@ -241,21 +239,19 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
|||||||
++du;
|
++du;
|
||||||
}
|
}
|
||||||
else if (programCount == 16) {
|
else if (programCount == 16) {
|
||||||
nx = 4;
|
// Two at once in both x and y
|
||||||
ny = 1;
|
nx = ny = 2;
|
||||||
if (programIndex >= 4 && programIndex < 8)
|
if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
|
||||||
++du;
|
++du;
|
||||||
if (programIndex >= 8 && programIndex < 12)
|
if (programIndex >= 8)
|
||||||
du += 2;
|
++dv;
|
||||||
if (programIndex >= 12)
|
|
||||||
du += 3;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Now loop over all of the pixels, stepping in x and y as calculated
|
// Now loop over all of the pixels, stepping in x and y as calculated
|
||||||
// above. (Assumes that ny divides y and nx divides x...)
|
// above. (Assumes that ny divides y and nx divides x...)
|
||||||
for (uniform int y = y0; y < y1; y += ny) {
|
for (uniform int y = y0; y < y1; y += ny) {
|
||||||
for (uniform int x = 0; x < w; x += nx) {
|
for (uniform int x = 0; x < w; x += nx) {
|
||||||
// Figure out x,y pixel in NDC
|
// Figur out x,y pixel in NDC
|
||||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||||
float ret = 0.f;
|
float ret = 0.f;
|
||||||
@@ -297,7 +293,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
|||||||
|
|
||||||
// offset to the first pixel in the image
|
// offset to the first pixel in the image
|
||||||
uniform int offset = 3 * (y * w + x);
|
uniform int offset = 3 * (y * w + x);
|
||||||
for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
|
for (uniform int p = 0; p < programCount; p += 4, ++offset) {
|
||||||
// Get the four sample values for this pixel
|
// Get the four sample values for this pixel
|
||||||
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
|
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
|
||||||
retArray[p+3];
|
retArray[p+3];
|
||||||
@@ -319,15 +315,3 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
|
|||||||
uniform float image[]) {
|
uniform float image[]) {
|
||||||
ao_scanlines(0, h, w, h, nsubsamples, image);
|
ao_scanlines(0, h, w, h, nsubsamples, image);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void task ao_task(uniform int width, uniform int height,
|
|
||||||
uniform int nsubsamples, uniform float image[]) {
|
|
||||||
ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
|
||||||
uniform float image[]) {
|
|
||||||
launch[h] < ao_task(w, h, nsubsamples, image) >;
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -21,23 +21,22 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClCompile Include="ao.cpp" />
|
<ClCompile Include="ao.cpp" />
|
||||||
<ClCompile Include="instrument.cpp" />
|
<ClCompile Include="instrument.cpp" />
|
||||||
<ClCompile Include="../tasksys.cpp" />
|
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="ao.ispc">
|
<CustomBuild Include="ao.ispc">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<PropertyGroup Label="Globals">
|
<PropertyGroup Label="Globals">
|
||||||
@@ -86,23 +85,15 @@
|
|||||||
<PropertyGroup Label="UserMacros" />
|
<PropertyGroup Label="UserMacros" />
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<ClCompile>
|
<ClCompile>
|
||||||
@@ -110,8 +101,7 @@
|
|||||||
</PrecompiledHeader>
|
</PrecompiledHeader>
|
||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
<SubSystem>Console</SubSystem>
|
<SubSystem>Console</SubSystem>
|
||||||
@@ -124,8 +114,7 @@
|
|||||||
</PrecompiledHeader>
|
</PrecompiledHeader>
|
||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
<SubSystem>Console</SubSystem>
|
<SubSystem>Console</SubSystem>
|
||||||
@@ -140,8 +129,7 @@
|
|||||||
<Optimization>MaxSpeed</Optimization>
|
<Optimization>MaxSpeed</Optimization>
|
||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
<SubSystem>Console</SubSystem>
|
<SubSystem>Console</SubSystem>
|
||||||
@@ -158,8 +146,7 @@
|
|||||||
<Optimization>MaxSpeed</Optimization>
|
<Optimization>MaxSpeed</Optimization>
|
||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
<SubSystem>Console</SubSystem>
|
<SubSystem>Console</SubSystem>
|
||||||
@@ -171,4 +158,4 @@
|
|||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
Copyright (c) 2011, Intel Corporation
|
Copyright (c) 2010-2011, Intel Corporation
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
@@ -31,35 +31,36 @@
|
|||||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/** @file ast.cpp
|
#ifndef ISPC_CPUID_H
|
||||||
@brief
|
#define ISPC_CPUID_H 1
|
||||||
*/
|
|
||||||
|
|
||||||
#include "ast.h"
|
#ifdef _MSC_VER
|
||||||
#include "func.h"
|
// Provides a __cpuid() function with same signature as below
|
||||||
#include "sym.h"
|
#include <intrin.h>
|
||||||
|
#else
|
||||||
|
static void __cpuid(int info[4], int infoType) {
|
||||||
|
__asm__ __volatile__ ("cpuid"
|
||||||
|
: "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||||
|
: "0" (infoType));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
inline bool CPUSupportsSSE2() {
|
||||||
// ASTNode
|
int info[4];
|
||||||
|
__cpuid(info, 1);
|
||||||
ASTNode::~ASTNode() {
|
return (info[3] & (1 << 26)) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline bool CPUSupportsSSE4() {
|
||||||
///////////////////////////////////////////////////////////////////////////
|
int info[4];
|
||||||
// AST
|
__cpuid(info, 1);
|
||||||
|
return (info[2] & (1 << 19)) != 0;
|
||||||
void
|
|
||||||
AST::AddFunction(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code) {
|
|
||||||
if (sym == NULL)
|
|
||||||
return;
|
|
||||||
functions.push_back(new Function(sym, args, code));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline bool CPUSupportsAVX() {
|
||||||
void
|
int info[4];
|
||||||
AST::GenerateIR() {
|
__cpuid(info, 1);
|
||||||
for (unsigned int i = 0; i < functions.size(); ++i)
|
return (info[2] & (1 << 28)) != 0;
|
||||||
functions[i]->GenerateIR();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif // ISPC_CPUID_H
|
||||||
@@ -1,18 +1,22 @@
|
|||||||
|
|
||||||
ARCH = $(shell uname)
|
ARCH = $(shell uname)
|
||||||
|
|
||||||
TASK_CXX=../tasksys.cpp
|
TASK_CXX=../tasks_pthreads.cpp
|
||||||
TASK_LIB=-lpthread
|
TASK_LIB=-lpthread
|
||||||
|
|
||||||
|
ifeq ($(ARCH), Darwin)
|
||||||
|
TASK_CXX=../tasks_gcd.cpp
|
||||||
|
TASK_LIB=
|
||||||
|
endif
|
||||||
|
|
||||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||||
|
|
||||||
CXX=g++
|
CXX=g++
|
||||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64 --math-lib=fast
|
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64 --math-lib=fast
|
||||||
|
|
||||||
OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/kernels_ispc_sse2.o \
|
OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/dynamic_c.o objs/dynamic_cilk.o
|
||||||
objs/kernels_ispc_sse4.o objs/kernels_ispc_avx.o \
|
|
||||||
objs/dynamic_c.o objs/dynamic_cilk.o
|
|
||||||
|
|
||||||
default: deferred_shading
|
default: deferred_shading
|
||||||
|
|
||||||
@@ -34,5 +38,5 @@ objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
|
|||||||
objs/%.o: ../%.cpp
|
objs/%.o: ../%.cpp
|
||||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||||
|
|
||||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||||
|
|||||||
@@ -64,7 +64,7 @@
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
static void *
|
static void *
|
||||||
lAlignedMalloc(size_t size, int32_t alignment) {
|
lAlignedMalloc(int64_t size, int32_t alignment) {
|
||||||
#ifdef ISPC_IS_WINDOWS
|
#ifdef ISPC_IS_WINDOWS
|
||||||
return _aligned_malloc(size, alignment);
|
return _aligned_malloc(size, alignment);
|
||||||
#endif
|
#endif
|
||||||
@@ -118,7 +118,6 @@ Framebuffer::clear() {
|
|||||||
memset(b, 0, nPixels);
|
memset(b, 0, nPixels);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
InputData *
|
InputData *
|
||||||
CreateInputDataFromFile(const char *path) {
|
CreateInputDataFromFile(const char *path) {
|
||||||
FILE *in = fopen(path, "rb");
|
FILE *in = fopen(path, "rb");
|
||||||
@@ -178,7 +177,8 @@ CreateInputDataFromFile(const char *path) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void DeleteInputData(InputData *input) {
|
void DeleteInputData(InputData *input)
|
||||||
|
{
|
||||||
lAlignedFree(input->chunk);
|
lAlignedFree(input->chunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -64,19 +64,15 @@
|
|||||||
<PropertyGroup Label="UserMacros" />
|
<PropertyGroup Label="UserMacros" />
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<ClCompile>
|
<ClCompile>
|
||||||
@@ -85,7 +81,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -101,7 +96,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -119,7 +113,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -138,7 +131,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -153,23 +145,23 @@
|
|||||||
<ClCompile Include="dynamic_c.cpp" />
|
<ClCompile Include="dynamic_c.cpp" />
|
||||||
<ClCompile Include="dynamic_cilk.cpp" />
|
<ClCompile Include="dynamic_cilk.cpp" />
|
||||||
<ClCompile Include="main.cpp" />
|
<ClCompile Include="main.cpp" />
|
||||||
<ClCompile Include="../tasksys.cpp" />
|
<ClCompile Include="../tasks_concrt.cpp" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="kernels.ispc">
|
<CustomBuild Include="kernels.ispc">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
|||||||
@@ -60,7 +60,7 @@
|
|||||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||||
|
|
||||||
static void *
|
static void *
|
||||||
lAlignedMalloc(size_t size, int32_t alignment) {
|
lAlignedMalloc(int64_t size, int32_t alignment) {
|
||||||
#ifdef ISPC_IS_WINDOWS
|
#ifdef ISPC_IS_WINDOWS
|
||||||
return _aligned_malloc(size, alignment);
|
return _aligned_malloc(size, alignment);
|
||||||
#endif
|
#endif
|
||||||
@@ -141,10 +141,12 @@ ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
|
|||||||
{
|
{
|
||||||
for (int tileX = 0; tileX < numTilesX; ++tileX) {
|
for (int tileX = 0; tileX < numTilesX; ++tileX) {
|
||||||
float minZ, maxZ;
|
float minZ, maxZ;
|
||||||
ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
|
ComputeZBounds(
|
||||||
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
tileX * tileWidth, tileX * tileWidth + tileWidth,
|
||||||
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
|
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
||||||
cameraNear, cameraFar, &minZ, &maxZ);
|
zBuffer, gBufferWidth,
|
||||||
|
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||||
|
&minZ, &maxZ);
|
||||||
minZArray[tileX] = minZ;
|
minZArray[tileX] = minZ;
|
||||||
maxZArray[tileX] = maxZ;
|
maxZArray[tileX] = maxZ;
|
||||||
}
|
}
|
||||||
@@ -280,8 +282,8 @@ void InitDynamicC(InputData *input) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* We're going to split a tile into 4 sub-tiles. This function
|
// numLights need not be a multiple of programCount here, but the input and output arrays
|
||||||
reclassifies the tile's lights with respect to the sub-tiles. */
|
// should be able to handle programCount-sized load/stores.
|
||||||
static void
|
static void
|
||||||
SplitTileMinMax(
|
SplitTileMinMax(
|
||||||
int tileMidX, int tileMidY,
|
int tileMidX, int tileMidY,
|
||||||
@@ -337,7 +339,7 @@ SplitTileMinMax(
|
|||||||
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
||||||
float light_attenuationEndNeg = -light_attenuationEnd;
|
float light_attenuationEndNeg = -light_attenuationEnd;
|
||||||
|
|
||||||
// Test lights again against subtile z bounds
|
// Test lights again subtile z bounds
|
||||||
bool inFrustum[4];
|
bool inFrustum[4];
|
||||||
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
|
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
|
||||||
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
|
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
|
||||||
@@ -412,8 +414,7 @@ Float32ToUnorm8(float f) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static inline float
|
static inline float half_to_float_fast(uint16_t h) {
|
||||||
half_to_float_fast(uint16_t h) {
|
|
||||||
uint32_t hs = h & (int32_t)0x8000u; // Pick off sign bit
|
uint32_t hs = h & (int32_t)0x8000u; // Pick off sign bit
|
||||||
uint32_t he = h & (int32_t)0x7C00u; // Pick off exponent bits
|
uint32_t he = h & (int32_t)0x7C00u; // Pick off exponent bits
|
||||||
uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits
|
uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits
|
||||||
|
|||||||
@@ -31,7 +31,7 @@
|
|||||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifdef __cilk
|
#ifdef __cilkplusplus
|
||||||
|
|
||||||
#include "deferred.h"
|
#include "deferred.h"
|
||||||
#include "kernels_ispc.h"
|
#include "kernels_ispc.h"
|
||||||
@@ -60,7 +60,7 @@
|
|||||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||||
|
|
||||||
static void *
|
static void *
|
||||||
lAlignedMalloc(size_t size, int32_t alignment) {
|
lAlignedMalloc(int64_t size, int32_t alignment) {
|
||||||
#ifdef ISPC_IS_WINDOWS
|
#ifdef ISPC_IS_WINDOWS
|
||||||
return _aligned_malloc(size, alignment);
|
return _aligned_malloc(size, alignment);
|
||||||
#endif
|
#endif
|
||||||
@@ -395,4 +395,4 @@ DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // __cilk
|
#endif // __cilkplusplus
|
||||||
|
|||||||
@@ -35,22 +35,22 @@
|
|||||||
|
|
||||||
struct InputDataArrays
|
struct InputDataArrays
|
||||||
{
|
{
|
||||||
uniform float * uniform zBuffer;
|
uniform float zBuffer[];
|
||||||
uniform unsigned int16 * uniform normalEncoded_x; // half float
|
uniform unsigned int16 normalEncoded_x[]; // half float
|
||||||
uniform unsigned int16 * uniform normalEncoded_y; // half float
|
uniform unsigned int16 normalEncoded_y[]; // half float
|
||||||
uniform unsigned int16 * uniform specularAmount; // half float
|
uniform unsigned int16 specularAmount[]; // half float
|
||||||
uniform unsigned int16 * uniform specularPower; // half float
|
uniform unsigned int16 specularPower[]; // half float
|
||||||
uniform unsigned int8 * uniform albedo_x; // unorm8
|
uniform unsigned int8 albedo_x[]; // unorm8
|
||||||
uniform unsigned int8 * uniform albedo_y; // unorm8
|
uniform unsigned int8 albedo_y[]; // unorm8
|
||||||
uniform unsigned int8 * uniform albedo_z; // unorm8
|
uniform unsigned int8 albedo_z[]; // unorm8
|
||||||
uniform float * uniform lightPositionView_x;
|
uniform float lightPositionView_x[];
|
||||||
uniform float * uniform lightPositionView_y;
|
uniform float lightPositionView_y[];
|
||||||
uniform float * uniform lightPositionView_z;
|
uniform float lightPositionView_z[];
|
||||||
uniform float * uniform lightAttenuationBegin;
|
uniform float lightAttenuationBegin[];
|
||||||
uniform float * uniform lightColor_x;
|
uniform float lightColor_x[];
|
||||||
uniform float * uniform lightColor_y;
|
uniform float lightColor_y[];
|
||||||
uniform float * uniform lightColor_z;
|
uniform float lightColor_z[];
|
||||||
uniform float * uniform lightAttenuationEnd;
|
uniform float lightAttenuationEnd[];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct InputHeader
|
struct InputHeader
|
||||||
@@ -66,6 +66,8 @@ struct InputHeader
|
|||||||
uniform int32 inputDataArrayOffsets[idaNum];
|
uniform int32 inputDataArrayOffsets[idaNum];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export void foo(reference InputHeader h) { }
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Common utility routines
|
// Common utility routines
|
||||||
@@ -77,7 +79,8 @@ dot3(float x, float y, float z, float a, float b, float c) {
|
|||||||
|
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
|
normalize3(float x, float y, float z, reference float ox,
|
||||||
|
reference float oy, reference float oz) {
|
||||||
float n = rsqrt(x*x + y*y + z*z);
|
float n = rsqrt(x*x + y*y + z*z);
|
||||||
ox = x * n;
|
ox = x * n;
|
||||||
oy = y * n;
|
oy = y * n;
|
||||||
@@ -97,6 +100,7 @@ Float32ToUnorm8(float f) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// tile width must be a multiple of programCount (SIMD size)
|
||||||
static void
|
static void
|
||||||
ComputeZBounds(
|
ComputeZBounds(
|
||||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||||
@@ -108,17 +112,17 @@ ComputeZBounds(
|
|||||||
uniform float cameraProj_33, uniform float cameraProj_43,
|
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||||
uniform float cameraNear, uniform float cameraFar,
|
uniform float cameraNear, uniform float cameraFar,
|
||||||
// Output
|
// Output
|
||||||
uniform float &minZ,
|
reference uniform float minZ,
|
||||||
uniform float &maxZ
|
reference uniform float maxZ
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
// Find Z bounds
|
// Find Z bounds
|
||||||
float laneMinZ = cameraFar;
|
float laneMinZ = cameraFar;
|
||||||
float laneMaxZ = cameraNear;
|
float laneMaxZ = cameraNear;
|
||||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||||
foreach (x = tileStartX ... tileEndX) {
|
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
|
||||||
// Unproject depth buffer Z value into view space
|
// Unproject depth buffer Z value into view space
|
||||||
float z = zBuffer[y * gBufferWidth + x];
|
float z = zBuffer[(y * gBufferWidth + x) + programIndex];
|
||||||
float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
|
float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
|
||||||
|
|
||||||
// Work out Z bounds for our samples
|
// Work out Z bounds for our samples
|
||||||
@@ -134,6 +138,8 @@ ComputeZBounds(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// tile width must be a multiple of programCount (SIMD size)
|
||||||
|
// numLights must currently be a multiple of programCount (SIMD size)
|
||||||
export uniform int32
|
export uniform int32
|
||||||
IntersectLightsWithTileMinMax(
|
IntersectLightsWithTileMinMax(
|
||||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||||
@@ -152,7 +158,7 @@ IntersectLightsWithTileMinMax(
|
|||||||
uniform float light_positionView_z_array[],
|
uniform float light_positionView_z_array[],
|
||||||
uniform float light_attenuationEnd_array[],
|
uniform float light_attenuationEnd_array[],
|
||||||
// Output
|
// Output
|
||||||
uniform int32 tileLightIndices[]
|
reference uniform int32 tileLightIndices[]
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||||
@@ -194,7 +200,9 @@ IntersectLightsWithTileMinMax(
|
|||||||
|
|
||||||
uniform int32 tileNumLights = 0;
|
uniform int32 tileNumLights = 0;
|
||||||
|
|
||||||
foreach (lightIndex = 0 ... numLights) {
|
for (uniform int32 baseLightIndex = 0; baseLightIndex < numLights;
|
||||||
|
baseLightIndex += programCount) {
|
||||||
|
int32 lightIndex = baseLightIndex + programIndex;
|
||||||
float light_positionView_z = light_positionView_z_array[lightIndex];
|
float light_positionView_z = light_positionView_z_array[lightIndex];
|
||||||
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
||||||
float light_attenuationEndNeg = -light_attenuationEnd;
|
float light_attenuationEndNeg = -light_attenuationEnd;
|
||||||
@@ -209,31 +217,32 @@ IntersectLightsWithTileMinMax(
|
|||||||
// don't actually need to mask the rest of this function - this is
|
// don't actually need to mask the rest of this function - this is
|
||||||
// just a greedy early-out. Could also structure all of this as
|
// just a greedy early-out. Could also structure all of this as
|
||||||
// nested if() statements, but this a bit easier to read
|
// nested if() statements, but this a bit easier to read
|
||||||
if (any(inFrustum)) {
|
if (!any(inFrustum))
|
||||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
continue;
|
||||||
float light_positionView_y = light_positionView_y_array[lightIndex];
|
|
||||||
|
|
||||||
d = light_positionView_z * frustumPlanes_z[0] +
|
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||||
light_positionView_x * frustumPlanes_xy[0];
|
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
|
||||||
|
|
||||||
d = light_positionView_z * frustumPlanes_z[1] +
|
d = light_positionView_z * frustumPlanes_z[0] +
|
||||||
light_positionView_x * frustumPlanes_xy[1];
|
light_positionView_x * frustumPlanes_xy[0];
|
||||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||||
|
|
||||||
d = light_positionView_z * frustumPlanes_z[2] +
|
d = light_positionView_z * frustumPlanes_z[1] +
|
||||||
light_positionView_y * frustumPlanes_xy[2];
|
light_positionView_x * frustumPlanes_xy[1];
|
||||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||||
|
|
||||||
d = light_positionView_z * frustumPlanes_z[3] +
|
d = light_positionView_z * frustumPlanes_z[2] +
|
||||||
light_positionView_y * frustumPlanes_xy[3];
|
light_positionView_y * frustumPlanes_xy[2];
|
||||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||||
|
|
||||||
|
d = light_positionView_z * frustumPlanes_z[3] +
|
||||||
|
light_positionView_y * frustumPlanes_xy[3];
|
||||||
|
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||||
|
|
||||||
// Pack and store intersecting lights
|
// Pack and store intersecting lights
|
||||||
cif (inFrustum) {
|
cif (inFrustum) {
|
||||||
tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
|
tileNumLights += packed_store_active(tileLightIndices, tileNumLights,
|
||||||
lightIndex);
|
lightIndex);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -241,6 +250,8 @@ IntersectLightsWithTileMinMax(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// tile width must be a multiple of programCount (SIMD size)
|
||||||
|
// numLights must currently be a multiple of programCount (SIMD size)
|
||||||
static uniform int32
|
static uniform int32
|
||||||
IntersectLightsWithTile(
|
IntersectLightsWithTile(
|
||||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||||
@@ -259,7 +270,7 @@ IntersectLightsWithTile(
|
|||||||
uniform float light_positionView_z_array[],
|
uniform float light_positionView_z_array[],
|
||||||
uniform float light_attenuationEnd_array[],
|
uniform float light_attenuationEnd_array[],
|
||||||
// Output
|
// Output
|
||||||
uniform int32 tileLightIndices[]
|
reference uniform int32 tileLightIndices[]
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
uniform float minZ, maxZ;
|
uniform float minZ, maxZ;
|
||||||
@@ -278,31 +289,32 @@ IntersectLightsWithTile(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// tile width must be a multiple of programCount (SIMD size)
|
||||||
export void
|
export void
|
||||||
ShadeTile(
|
ShadeTile(
|
||||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||||
uniform int32 tileStartY, uniform int32 tileEndY,
|
uniform int32 tileStartY, uniform int32 tileEndY,
|
||||||
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
|
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
|
||||||
uniform InputDataArrays &inputData,
|
reference uniform InputDataArrays inputData,
|
||||||
// Camera data
|
// Camera data
|
||||||
uniform float cameraProj_11, uniform float cameraProj_22,
|
uniform float cameraProj_11, uniform float cameraProj_22,
|
||||||
uniform float cameraProj_33, uniform float cameraProj_43,
|
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||||
// Light list
|
// Light list
|
||||||
uniform int32 tileLightIndices[],
|
reference uniform int32 tileLightIndices[],
|
||||||
uniform int32 tileNumLights,
|
uniform int32 tileNumLights,
|
||||||
// UI
|
// UI
|
||||||
uniform bool visualizeLightCount,
|
uniform bool visualizeLightCount,
|
||||||
// Output
|
// Output
|
||||||
uniform unsigned int8 framebuffer_r[],
|
reference uniform unsigned int8 framebuffer_r[],
|
||||||
uniform unsigned int8 framebuffer_g[],
|
reference uniform unsigned int8 framebuffer_g[],
|
||||||
uniform unsigned int8 framebuffer_b[]
|
reference uniform unsigned int8 framebuffer_b[]
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
if (tileNumLights == 0 || visualizeLightCount) {
|
if (tileNumLights == 0 || visualizeLightCount) {
|
||||||
uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
|
uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
|
||||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||||
foreach (x = tileStartX ... tileEndX) {
|
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
|
||||||
int32 framebufferIndex = (y * gBufferWidth + x);
|
int32 framebufferIndex = (y * gBufferWidth + x) + programIndex;
|
||||||
framebuffer_r[framebufferIndex] = c;
|
framebuffer_r[framebufferIndex] = c;
|
||||||
framebuffer_g[framebufferIndex] = c;
|
framebuffer_g[framebufferIndex] = c;
|
||||||
framebuffer_b[framebufferIndex] = c;
|
framebuffer_b[framebufferIndex] = c;
|
||||||
@@ -315,8 +327,9 @@ ShadeTile(
|
|||||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||||
uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||||
|
|
||||||
foreach (x = tileStartX ... tileEndX) {
|
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
|
||||||
int32 gBufferOffset = y * gBufferWidth + x;
|
uniform int32 gBufferOffsetBase = y * gBufferWidth + x;
|
||||||
|
int32 gBufferOffset = gBufferOffsetBase + programIndex;
|
||||||
|
|
||||||
// Reconstruct position and (negative) view vector from G-buffer
|
// Reconstruct position and (negative) view vector from G-buffer
|
||||||
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||||
@@ -326,7 +339,7 @@ ShadeTile(
|
|||||||
|
|
||||||
// Compute screen/clip-space position
|
// Compute screen/clip-space position
|
||||||
// NOTE: Mind DX11 viewport transform and pixel center!
|
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||||
float positionScreen_x = (0.5f + (float)(x)) *
|
float positionScreen_x = (0.5f + (float)(x + programIndex)) *
|
||||||
twoOverGBufferWidth - 1.0f;
|
twoOverGBufferWidth - 1.0f;
|
||||||
|
|
||||||
// Unproject depth buffer Z value into view space
|
// Unproject depth buffer Z value into view space
|
||||||
@@ -466,21 +479,24 @@ ShadeTile(
|
|||||||
// Static decomposition
|
// Static decomposition
|
||||||
|
|
||||||
task void
|
task void
|
||||||
RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
|
||||||
uniform InputHeader &inputHeader,
|
reference uniform InputHeader inputHeader,
|
||||||
uniform InputDataArrays &inputData,
|
reference uniform InputDataArrays inputData,
|
||||||
uniform int visualizeLightCount,
|
uniform int visualizeLightCount,
|
||||||
// Output
|
// Output
|
||||||
uniform unsigned int8 framebuffer_r[],
|
reference uniform unsigned int8 framebuffer_r[],
|
||||||
uniform unsigned int8 framebuffer_g[],
|
reference uniform unsigned int8 framebuffer_g[],
|
||||||
uniform unsigned int8 framebuffer_b[]) {
|
reference uniform unsigned int8 framebuffer_b[]) {
|
||||||
uniform int32 group_y = taskIndex / num_groups_x;
|
uniform int32 group_y = g / num_groups_x;
|
||||||
uniform int32 group_x = taskIndex % num_groups_x;
|
uniform int32 group_x = g % num_groups_x;
|
||||||
uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
|
uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
|
||||||
uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
|
uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
|
||||||
uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
|
uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
|
||||||
uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
|
uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
|
||||||
|
|
||||||
|
uniform int sTileNumLights = 0;
|
||||||
|
uniform int sTileLightIndices[MAX_LIGHTS]; // Light list for the tile
|
||||||
|
|
||||||
uniform int framebufferWidth = inputHeader.framebufferWidth;
|
uniform int framebufferWidth = inputHeader.framebufferWidth;
|
||||||
uniform int framebufferHeight = inputHeader.framebufferHeight;
|
uniform int framebufferHeight = inputHeader.framebufferHeight;
|
||||||
uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
|
uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
|
||||||
@@ -488,9 +504,8 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
|||||||
uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
|
uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
|
||||||
uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
|
uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
|
||||||
|
|
||||||
// Light intersection: figure out which lights illuminate this tile.
|
// Light intersection
|
||||||
uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
|
sTileNumLights =
|
||||||
uniform int numTileLights =
|
|
||||||
IntersectLightsWithTile(tile_start_x, tile_end_x,
|
IntersectLightsWithTile(tile_start_x, tile_end_x,
|
||||||
tile_start_y, tile_end_y,
|
tile_start_y, tile_end_y,
|
||||||
framebufferWidth, framebufferHeight,
|
framebufferWidth, framebufferHeight,
|
||||||
@@ -503,43 +518,41 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
|||||||
inputData.lightPositionView_y,
|
inputData.lightPositionView_y,
|
||||||
inputData.lightPositionView_z,
|
inputData.lightPositionView_z,
|
||||||
inputData.lightAttenuationEnd,
|
inputData.lightAttenuationEnd,
|
||||||
tileLightIndices);
|
sTileLightIndices);
|
||||||
|
|
||||||
// And now shade the tile, using the lights in tileLightIndices
|
|
||||||
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
|
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
|
||||||
framebufferWidth, framebufferHeight, inputData,
|
framebufferWidth, framebufferHeight, inputData,
|
||||||
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
|
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
|
||||||
tileLightIndices, numTileLights, visualizeLightCount,
|
sTileLightIndices, sTileNumLights, visualizeLightCount,
|
||||||
framebuffer_r, framebuffer_g, framebuffer_b);
|
framebuffer_r, framebuffer_g, framebuffer_b);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export void
|
export void
|
||||||
RenderStatic(uniform InputHeader &inputHeader,
|
RenderStatic(reference uniform InputHeader inputHeader,
|
||||||
uniform InputDataArrays &inputData,
|
reference uniform InputDataArrays inputData,
|
||||||
uniform int visualizeLightCount,
|
uniform int visualizeLightCount,
|
||||||
// Output
|
// Output
|
||||||
uniform unsigned int8 framebuffer_r[],
|
reference uniform unsigned int8 framebuffer_r[],
|
||||||
uniform unsigned int8 framebuffer_g[],
|
reference uniform unsigned int8 framebuffer_g[],
|
||||||
uniform unsigned int8 framebuffer_b[]) {
|
reference uniform unsigned int8 framebuffer_b[]) {
|
||||||
uniform int num_groups_x = (inputHeader.framebufferWidth +
|
uniform int num_groups_x = (inputHeader.framebufferWidth +
|
||||||
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
|
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
|
||||||
uniform int num_groups_y = (inputHeader.framebufferHeight +
|
uniform int num_groups_y = (inputHeader.framebufferHeight +
|
||||||
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
|
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
|
||||||
uniform int num_groups = num_groups_x * num_groups_y;
|
uniform int num_groups = num_groups_x * num_groups_y;
|
||||||
|
|
||||||
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
|
for (uniform int g = 0; g < num_groups; ++g)
|
||||||
// by MIN_TILE_HEIGHT pixels.
|
launch < RenderTile(g, num_groups_x, num_groups_y,
|
||||||
launch[num_groups] < RenderTile(num_groups_x, num_groups_y,
|
inputHeader, inputData, visualizeLightCount,
|
||||||
inputHeader, inputData, visualizeLightCount,
|
framebuffer_r, framebuffer_g, framebuffer_b) >;
|
||||||
framebuffer_r, framebuffer_g, framebuffer_b) >;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Routines for dynamic decomposition path
|
// Routines for dynamic decomposition path
|
||||||
|
|
||||||
// This computes the z min/max range for a whole row worth of tiles.
|
// tile width must be a multiple of programCount (SIMD size)
|
||||||
export void
|
export void
|
||||||
ComputeZBoundsRow(
|
ComputeZBoundsRow(
|
||||||
uniform int32 tileY,
|
uniform int32 tileY,
|
||||||
@@ -552,8 +565,8 @@ ComputeZBoundsRow(
|
|||||||
uniform float cameraProj_33, uniform float cameraProj_43,
|
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||||
uniform float cameraNear, uniform float cameraFar,
|
uniform float cameraNear, uniform float cameraFar,
|
||||||
// Output
|
// Output
|
||||||
uniform float minZArray[],
|
reference uniform float minZArray[],
|
||||||
uniform float maxZArray[]
|
reference uniform float maxZArray[]
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
|
for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
|
||||||
@@ -570,7 +583,6 @@ ComputeZBoundsRow(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
|
|
||||||
// numLights need not be a multiple of programCount here, but the input and output arrays
|
// numLights need not be a multiple of programCount here, but the input and output arrays
|
||||||
// should be able to handle programCount-sized load/stores.
|
// should be able to handle programCount-sized load/stores.
|
||||||
export void
|
export void
|
||||||
@@ -584,7 +596,7 @@ SplitTileMinMax(
|
|||||||
// Camera data
|
// Camera data
|
||||||
uniform float cameraProj_11, uniform float cameraProj_22,
|
uniform float cameraProj_11, uniform float cameraProj_22,
|
||||||
// Light Data
|
// Light Data
|
||||||
uniform int32 lightIndices[],
|
reference uniform int32 lightIndices[],
|
||||||
uniform int32 numLights,
|
uniform int32 numLights,
|
||||||
uniform float light_positionView_x_array[],
|
uniform float light_positionView_x_array[],
|
||||||
uniform float light_positionView_y_array[],
|
uniform float light_positionView_y_array[],
|
||||||
@@ -593,9 +605,9 @@ SplitTileMinMax(
|
|||||||
// Outputs
|
// Outputs
|
||||||
// TODO: ISPC doesn't currently like multidimensionsal arrays so we'll do the
|
// TODO: ISPC doesn't currently like multidimensionsal arrays so we'll do the
|
||||||
// indexing math ourselves
|
// indexing math ourselves
|
||||||
uniform int32 subtileIndices[],
|
reference uniform int32 subtileIndices[],
|
||||||
uniform int32 subtileIndicesPitch,
|
uniform int32 subtileIndicesPitch,
|
||||||
uniform int32 subtileNumLights[]
|
reference uniform int32 subtileNumLights[]
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||||
@@ -633,7 +645,12 @@ SplitTileMinMax(
|
|||||||
subtileLightOffset[2] = 2 * subtileIndicesPitch;
|
subtileLightOffset[2] = 2 * subtileIndicesPitch;
|
||||||
subtileLightOffset[3] = 3 * subtileIndicesPitch;
|
subtileLightOffset[3] = 3 * subtileIndicesPitch;
|
||||||
|
|
||||||
foreach (i = 0 ... numLights) {
|
for (int32 i = programIndex; i < numLights; i += programCount) {
|
||||||
|
// TODO: ISPC says gather required here when it actually
|
||||||
|
// isn't... this could be fixed this by nesting an if() within a
|
||||||
|
// uniform loop, but I'm not totally sure if that's a win
|
||||||
|
// overall. For now we'll just eat the perf cost for cleanliness
|
||||||
|
// since the below are real gathers anyways.
|
||||||
int32 lightIndex = lightIndices[i];
|
int32 lightIndex = lightIndices[i];
|
||||||
|
|
||||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||||
@@ -676,21 +693,21 @@ SplitTileMinMax(
|
|||||||
// Pack and store intersecting lights
|
// Pack and store intersecting lights
|
||||||
// TODO: Experiment with a loop here instead
|
// TODO: Experiment with a loop here instead
|
||||||
cif (inFrustum[0])
|
cif (inFrustum[0])
|
||||||
subtileLightOffset[0] +=
|
subtileLightOffset[0] += packed_store_active(subtileIndices,
|
||||||
packed_store_active(&subtileIndices[subtileLightOffset[0]],
|
subtileLightOffset[0],
|
||||||
lightIndex);
|
lightIndex);
|
||||||
cif (inFrustum[1])
|
cif (inFrustum[1])
|
||||||
subtileLightOffset[1] +=
|
subtileLightOffset[1] += packed_store_active(subtileIndices,
|
||||||
packed_store_active(&subtileIndices[subtileLightOffset[1]],
|
subtileLightOffset[1],
|
||||||
lightIndex);
|
lightIndex);
|
||||||
cif (inFrustum[2])
|
cif (inFrustum[2])
|
||||||
subtileLightOffset[2] +=
|
subtileLightOffset[2] += packed_store_active(subtileIndices,
|
||||||
packed_store_active(&subtileIndices[subtileLightOffset[2]],
|
subtileLightOffset[2],
|
||||||
lightIndex);
|
lightIndex);
|
||||||
cif (inFrustum[3])
|
cif (inFrustum[3])
|
||||||
subtileLightOffset[3] +=
|
subtileLightOffset[3] += packed_store_active(subtileIndices,
|
||||||
packed_store_active(&subtileIndices[subtileLightOffset[3]],
|
subtileLightOffset[3],
|
||||||
lightIndex);
|
lightIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
|
subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
|
||||||
|
|||||||
@@ -63,7 +63,7 @@
|
|||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
if (argc != 2) {
|
if (argc != 2) {
|
||||||
printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
|
printf("usage: deferred_shading <input_file>\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -77,9 +77,9 @@ int main(int argc, char** argv) {
|
|||||||
input->header.framebufferHeight);
|
input->header.framebufferHeight);
|
||||||
|
|
||||||
InitDynamicC(input);
|
InitDynamicC(input);
|
||||||
#ifdef __cilk
|
#ifdef __cilkplusplus
|
||||||
InitDynamicCilk(input);
|
InitDynamicCilk(input);
|
||||||
#endif // __cilk
|
#endif // __cilkplusplus
|
||||||
|
|
||||||
int nframes = 5;
|
int nframes = 5;
|
||||||
double ispcCycles = 1e30;
|
double ispcCycles = 1e30;
|
||||||
@@ -98,21 +98,6 @@ int main(int argc, char** argv) {
|
|||||||
input->header.framebufferWidth, input->header.framebufferHeight);
|
input->header.framebufferWidth, input->header.framebufferHeight);
|
||||||
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
|
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
|
||||||
|
|
||||||
#ifdef __cilk
|
|
||||||
double dynamicCilkCycles = 1e30;
|
|
||||||
for (int i = 0; i < 5; ++i) {
|
|
||||||
framebuffer.clear();
|
|
||||||
reset_and_start_timer();
|
|
||||||
for (int j = 0; j < nframes; ++j)
|
|
||||||
DispatchDynamicCilk(input, &framebuffer);
|
|
||||||
double mcycles = get_elapsed_mcycles() / nframes;
|
|
||||||
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
|
|
||||||
}
|
|
||||||
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n",
|
|
||||||
dynamicCilkCycles);
|
|
||||||
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
|
|
||||||
#endif // __cilk
|
|
||||||
|
|
||||||
double serialCycles = 1e30;
|
double serialCycles = 1e30;
|
||||||
for (int i = 0; i < 5; ++i) {
|
for (int i = 0; i < 5; ++i) {
|
||||||
framebuffer.clear();
|
framebuffer.clear();
|
||||||
@@ -122,16 +107,29 @@ int main(int argc, char** argv) {
|
|||||||
double mcycles = get_elapsed_mcycles() / nframes;
|
double mcycles = get_elapsed_mcycles() / nframes;
|
||||||
serialCycles = std::min(serialCycles, mcycles);
|
serialCycles = std::min(serialCycles, mcycles);
|
||||||
}
|
}
|
||||||
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n",
|
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles\n",
|
||||||
serialCycles);
|
serialCycles);
|
||||||
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
|
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
|
||||||
|
|
||||||
#ifdef __cilk
|
#ifdef __cilkplusplus
|
||||||
|
double dynamicCilkCycles = 1e30;
|
||||||
|
for (int i = 0; i < 5; ++i) {
|
||||||
|
framebuffer.clear();
|
||||||
|
reset_and_start_timer();
|
||||||
|
for (int j = 0; j < nframes; ++j)
|
||||||
|
DispatchDynamicCilk(input, &framebuffer);
|
||||||
|
double mcycles = get_elapsed_mcycles() / nframes;
|
||||||
|
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
|
||||||
|
}
|
||||||
|
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles\n",
|
||||||
|
dynamicCilkCycles);
|
||||||
|
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
|
||||||
|
|
||||||
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
|
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
|
||||||
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
|
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
|
||||||
#else
|
#else
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
|
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
|
||||||
#endif // __cilk
|
#endif // __cilkplusplus
|
||||||
|
|
||||||
DeleteInputData(input);
|
DeleteInputData(input);
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
CXX=g++ -m64
|
CXX=g++ -m64
|
||||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
|
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||||
|
|
||||||
default: mandelbrot
|
default: mandelbrot
|
||||||
|
|
||||||
@@ -14,17 +14,13 @@ dirs:
|
|||||||
clean:
|
clean:
|
||||||
/bin/rm -rf objs *~ mandelbrot
|
/bin/rm -rf objs *~ mandelbrot
|
||||||
|
|
||||||
OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc_sse2.o \
|
mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o
|
||||||
objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o \
|
$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o -lm
|
||||||
objs/mandelbrot_ispc.o
|
|
||||||
|
|
||||||
mandelbrot: dirs $(OBJS)
|
|
||||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
|
|
||||||
|
|
||||||
objs/%.o: %.cpp
|
objs/%.o: %.cpp
|
||||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||||
|
|
||||||
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
||||||
|
|
||||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||||
|
|||||||
@@ -41,6 +41,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
|
#include "../cpuid.h"
|
||||||
#include "mandelbrot_ispc.h"
|
#include "mandelbrot_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
@@ -67,6 +68,38 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Make sure that the vector ISA used during compilation is supported by
|
||||||
|
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||||
|
// header file that we include above.
|
||||||
|
static void
|
||||||
|
ensureTargetISAIsSupported() {
|
||||||
|
#if defined(ISPC_TARGET_SSE2)
|
||||||
|
bool isaSupported = CPUSupportsSSE2();
|
||||||
|
const char *target = "SSE2";
|
||||||
|
#elif defined(ISPC_TARGET_SSE4)
|
||||||
|
bool isaSupported = CPUSupportsSSE4();
|
||||||
|
const char *target = "SSE4";
|
||||||
|
#elif defined(ISPC_TARGET_AVX)
|
||||||
|
bool isaSupported = CPUSupportsAVX();
|
||||||
|
const char *target = "AVX";
|
||||||
|
#else
|
||||||
|
#error "Unknown ISPC_TARGET_* value"
|
||||||
|
#endif
|
||||||
|
if (!isaSupported) {
|
||||||
|
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||||
|
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||||
|
fprintf(stderr, "***\n*** Please modify the "
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
"MSVC project file "
|
||||||
|
#else
|
||||||
|
"Makefile "
|
||||||
|
#endif
|
||||||
|
"to select another target (e.g. sse2)\n***\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
unsigned int width = 768;
|
unsigned int width = 768;
|
||||||
unsigned int height = 512;
|
unsigned int height = 512;
|
||||||
@@ -78,6 +111,8 @@ int main() {
|
|||||||
int maxIterations = 256;
|
int maxIterations = 256;
|
||||||
int *buf = new int[width*height];
|
int *buf = new int[width*height];
|
||||||
|
|
||||||
|
ensureTargetISAIsSupported();
|
||||||
|
|
||||||
//
|
//
|
||||||
// Compute the image using the ispc implementation; report the minimum
|
// Compute the image using the ispc implementation; report the minimum
|
||||||
// time of three runs.
|
// time of three runs.
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
|
|||||||
uniform float x1, uniform float y1,
|
uniform float x1, uniform float y1,
|
||||||
uniform int width, uniform int height,
|
uniform int width, uniform int height,
|
||||||
uniform int maxIterations,
|
uniform int maxIterations,
|
||||||
uniform int output[])
|
reference uniform int output[])
|
||||||
{
|
{
|
||||||
float dx = (x1 - x0) / width;
|
float dx = (x1 - x0) / width;
|
||||||
float dy = (y1 - y0) / height;
|
float dy = (y1 - y0) / height;
|
||||||
@@ -60,16 +60,16 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
|
|||||||
// Note that we'll be doing programCount computations in parallel,
|
// Note that we'll be doing programCount computations in parallel,
|
||||||
// so increment i by that much. This assumes that width evenly
|
// so increment i by that much. This assumes that width evenly
|
||||||
// divides programCount.
|
// divides programCount.
|
||||||
foreach (i = 0 ... width) {
|
for (uniform int i = 0; i < width; i += programCount) {
|
||||||
// Figure out the position on the complex plane to compute the
|
// Figure out the position on the complex plane to compute the
|
||||||
// number of iterations at. Note that the x values are
|
// number of iterations at. Note that the x values are
|
||||||
// different across different program instances, since its
|
// different across different program instances, since its
|
||||||
// initializer incorporates the value of the programIndex
|
// initializer incorporates the value of the programIndex
|
||||||
// variable.
|
// variable.
|
||||||
float x = x0 + i * dx;
|
float x = x0 + (programIndex + i) * dx;
|
||||||
float y = y0 + j * dy;
|
float y = y0 + j * dy;
|
||||||
|
|
||||||
int index = j * width + i;
|
int index = j * width + i + programIndex;
|
||||||
output[index] = mandel(x, y, maxIterations);
|
output[index] = mandel(x, y, maxIterations);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -64,19 +64,15 @@
|
|||||||
<PropertyGroup Label="UserMacros" />
|
<PropertyGroup Label="UserMacros" />
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<ClCompile>
|
<ClCompile>
|
||||||
@@ -85,7 +81,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -101,7 +96,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -119,7 +113,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -138,7 +131,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -155,18 +147,18 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="mandelbrot.ispc">
|
<CustomBuild Include="mandelbrot.ispc">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
|||||||
@@ -8,11 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
|||||||
CXX=g++
|
CXX=g++
|
||||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
|
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||||
|
|
||||||
OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o $(TASK_OBJ) \
|
|
||||||
objs/mandelbrot_ispc.o objs/mandelbrot_ispc_sse2.o \
|
|
||||||
objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o
|
|
||||||
|
|
||||||
default: mandelbrot
|
default: mandelbrot
|
||||||
|
|
||||||
@@ -24,8 +20,8 @@ dirs:
|
|||||||
clean:
|
clean:
|
||||||
/bin/rm -rf objs *~ mandelbrot
|
/bin/rm -rf objs *~ mandelbrot
|
||||||
|
|
||||||
mandelbrot: dirs $(OBJS)
|
mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o $(TASK_OBJ)
|
||||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||||
|
|
||||||
objs/%.o: %.cpp
|
objs/%.o: %.cpp
|
||||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||||
@@ -35,5 +31,5 @@ objs/%.o: ../%.cpp
|
|||||||
|
|
||||||
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
||||||
|
|
||||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||||
|
|||||||
@@ -42,6 +42,7 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
|
#include "../cpuid.h"
|
||||||
#include "mandelbrot_ispc.h"
|
#include "mandelbrot_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
@@ -68,6 +69,37 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Make sure that the vector ISA used during compilation is supported by
|
||||||
|
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||||
|
// header file that we include above.
|
||||||
|
static void
|
||||||
|
ensureTargetISAIsSupported() {
|
||||||
|
#if defined(ISPC_TARGET_SSE2)
|
||||||
|
bool isaSupported = CPUSupportsSSE2();
|
||||||
|
const char *target = "SSE2";
|
||||||
|
#elif defined(ISPC_TARGET_SSE4)
|
||||||
|
bool isaSupported = CPUSupportsSSE4();
|
||||||
|
const char *target = "SSE4";
|
||||||
|
#elif defined(ISPC_TARGET_AVX)
|
||||||
|
bool isaSupported = CPUSupportsAVX();
|
||||||
|
const char *target = "AVX";
|
||||||
|
#else
|
||||||
|
#error "Unknown ISPC_TARGET_* value"
|
||||||
|
#endif
|
||||||
|
if (!isaSupported) {
|
||||||
|
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||||
|
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||||
|
fprintf(stderr, "***\n*** Please modify the "
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
"MSVC project file "
|
||||||
|
#else
|
||||||
|
"Makefile "
|
||||||
|
#endif
|
||||||
|
"to select another target (e.g. sse2)\n***\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void usage() {
|
static void usage() {
|
||||||
fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
|
fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
@@ -100,6 +132,8 @@ int main(int argc, char *argv[]) {
|
|||||||
else
|
else
|
||||||
usage();
|
usage();
|
||||||
|
|
||||||
|
ensureTargetISAIsSupported();
|
||||||
|
|
||||||
int maxIterations = 512;
|
int maxIterations = 512;
|
||||||
int *buf = new int[width*height];
|
int *buf = new int[width*height];
|
||||||
|
|
||||||
|
|||||||
@@ -57,16 +57,18 @@ mandelbrot_scanlines(uniform int ybase, uniform int span,
|
|||||||
uniform float x0, uniform float dx,
|
uniform float x0, uniform float dx,
|
||||||
uniform float y0, uniform float dy,
|
uniform float y0, uniform float dy,
|
||||||
uniform int width, uniform int maxIterations,
|
uniform int width, uniform int maxIterations,
|
||||||
uniform int output[]) {
|
reference uniform int output[]) {
|
||||||
uniform int ystart = ybase + taskIndex * span;
|
uniform int ystart = ybase + taskIndex * span;
|
||||||
uniform int yend = ystart + span;
|
uniform int yend = ystart + span;
|
||||||
|
|
||||||
foreach (yi = ystart ... yend, xi = 0 ... width) {
|
for (uniform int j = ystart; j < yend; ++j) {
|
||||||
float x = x0 + xi * dx;
|
for (uniform int i = 0; i < width; i += programCount) {
|
||||||
float y = y0 + yi * dy;
|
float x = x0 + (programIndex + i) * dx;
|
||||||
|
float y = y0 + j * dy;
|
||||||
|
|
||||||
int index = yi * width + xi;
|
int index = j * width + i + programIndex;
|
||||||
output[index] = mandel(x, y, maxIterations);
|
output[index] = mandel(x, y, maxIterations);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -75,7 +77,7 @@ task void
|
|||||||
mandelbrot_chunk(uniform float x0, uniform float dx,
|
mandelbrot_chunk(uniform float x0, uniform float dx,
|
||||||
uniform float y0, uniform float dy,
|
uniform float y0, uniform float dy,
|
||||||
uniform int width, uniform int height,
|
uniform int width, uniform int height,
|
||||||
uniform int maxIterations, uniform int output[]) {
|
uniform int maxIterations, reference uniform int output[]) {
|
||||||
uniform int ystart = taskIndex * (height/taskCount);
|
uniform int ystart = taskIndex * (height/taskCount);
|
||||||
uniform int yend = (taskIndex+1) * (height/taskCount);
|
uniform int yend = (taskIndex+1) * (height/taskCount);
|
||||||
uniform int span = 1;
|
uniform int span = 1;
|
||||||
@@ -89,7 +91,7 @@ export void
|
|||||||
mandelbrot_ispc(uniform float x0, uniform float y0,
|
mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||||
uniform float x1, uniform float y1,
|
uniform float x1, uniform float y1,
|
||||||
uniform int width, uniform int height,
|
uniform int width, uniform int height,
|
||||||
uniform int maxIterations, uniform int output[]) {
|
uniform int maxIterations, reference uniform int output[]) {
|
||||||
uniform float dx = (x1 - x0) / width;
|
uniform float dx = (x1 - x0) / width;
|
||||||
uniform float dy = (y1 - y0) / height;
|
uniform float dy = (y1 - y0) / height;
|
||||||
|
|
||||||
|
|||||||
@@ -64,19 +64,15 @@
|
|||||||
<PropertyGroup Label="UserMacros" />
|
<PropertyGroup Label="UserMacros" />
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<ClCompile>
|
<ClCompile>
|
||||||
@@ -85,7 +81,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -101,7 +96,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -119,7 +113,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -138,7 +131,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -156,18 +148,18 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="mandelbrot.ispc">
|
<CustomBuild Include="mandelbrot.ispc">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
|||||||
@@ -2,10 +2,7 @@
|
|||||||
CXX=g++ -m64
|
CXX=g++ -m64
|
||||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O2 --target=sse2,sse4,avx-x2 --arch=x86-64
|
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
|
||||||
|
|
||||||
OBJS=objs/noise.o objs/noise_serial.o objs/noise_ispc.o objs/noise_ispc_sse2.o \
|
|
||||||
objs/noise_ispc_sse4.o objs/noise_ispc_avx.o
|
|
||||||
|
|
||||||
default: noise
|
default: noise
|
||||||
|
|
||||||
@@ -17,13 +14,13 @@ dirs:
|
|||||||
clean:
|
clean:
|
||||||
/bin/rm -rf objs *~ noise
|
/bin/rm -rf objs *~ noise
|
||||||
|
|
||||||
noise: dirs $(OBJS)
|
noise: dirs objs/noise.o objs/noise_serial.o objs/noise_ispc.o
|
||||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
|
$(CXX) $(CXXFLAGS) -o $@ objs/noise.o objs/noise_ispc.o objs/noise_serial.o -lm
|
||||||
|
|
||||||
objs/%.o: %.cpp
|
objs/%.o: %.cpp
|
||||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||||
|
|
||||||
objs/noise.o: objs/noise_ispc.h
|
objs/noise.o: objs/noise_ispc.h
|
||||||
|
|
||||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||||
|
|||||||
@@ -41,6 +41,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
|
#include "../cpuid.h"
|
||||||
#include "noise_ispc.h"
|
#include "noise_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
@@ -65,6 +66,38 @@ writePPM(float *buf, int width, int height, const char *fn) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Make sure that the vector ISA used during compilation is supported by
|
||||||
|
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||||
|
// header file that we include above.
|
||||||
|
static void
|
||||||
|
ensureTargetISAIsSupported() {
|
||||||
|
#if defined(ISPC_TARGET_SSE2)
|
||||||
|
bool isaSupported = CPUSupportsSSE2();
|
||||||
|
const char *target = "SSE2";
|
||||||
|
#elif defined(ISPC_TARGET_SSE4)
|
||||||
|
bool isaSupported = CPUSupportsSSE4();
|
||||||
|
const char *target = "SSE4";
|
||||||
|
#elif defined(ISPC_TARGET_AVX)
|
||||||
|
bool isaSupported = CPUSupportsAVX();
|
||||||
|
const char *target = "AVX";
|
||||||
|
#else
|
||||||
|
#error "Unknown ISPC_TARGET_* value"
|
||||||
|
#endif
|
||||||
|
if (!isaSupported) {
|
||||||
|
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||||
|
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||||
|
fprintf(stderr, "***\n*** Please modify the "
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
"MSVC project file "
|
||||||
|
#else
|
||||||
|
"Makefile "
|
||||||
|
#endif
|
||||||
|
"to select another target (e.g. sse2)\n***\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
unsigned int width = 768;
|
unsigned int width = 768;
|
||||||
unsigned int height = 768;
|
unsigned int height = 768;
|
||||||
@@ -75,6 +108,8 @@ int main() {
|
|||||||
|
|
||||||
float *buf = new float[width*height];
|
float *buf = new float[width*height];
|
||||||
|
|
||||||
|
ensureTargetISAIsSupported();
|
||||||
|
|
||||||
//
|
//
|
||||||
// Compute the image using the ispc implementation; report the minimum
|
// Compute the image using the ispc implementation; report the minimum
|
||||||
// time of three runs.
|
// time of three runs.
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
<ItemGroup Label="ProjectConfigurations">
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
<ProjectConfiguration Include="Debug|Win32">
|
<ProjectConfiguration Include="Debug|Win32">
|
||||||
@@ -64,19 +64,15 @@
|
|||||||
<PropertyGroup Label="UserMacros" />
|
<PropertyGroup Label="UserMacros" />
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<ClCompile>
|
<ClCompile>
|
||||||
@@ -85,7 +81,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -101,7 +96,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -119,7 +113,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -138,7 +131,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -155,18 +147,18 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="noise.ispc">
|
<CustomBuild Include="noise.ispc">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
|||||||
@@ -1,17 +1,8 @@
|
|||||||
|
|
||||||
TASK_CXX=../tasksys.cpp
|
|
||||||
TASK_LIB=-lpthread
|
|
||||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
|
||||||
|
|
||||||
|
|
||||||
CXX=g++ -m64
|
CXX=g++ -m64
|
||||||
CXXFLAGS=-Iobjs/ -g -Wall
|
CXXFLAGS=-Iobjs/ -g -Wall
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
|
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||||
|
|
||||||
OBJS=objs/options.o objs/options_serial.o objs/options_ispc.o \
|
|
||||||
objs/options_ispc_sse2.o objs/options_ispc_sse4.o \
|
|
||||||
objs/options_ispc_avx.o $(TASK_OBJ)
|
|
||||||
|
|
||||||
default: options
|
default: options
|
||||||
|
|
||||||
@@ -23,16 +14,13 @@ dirs:
|
|||||||
clean:
|
clean:
|
||||||
/bin/rm -rf objs *~ options
|
/bin/rm -rf objs *~ options
|
||||||
|
|
||||||
options: dirs $(OBJS)
|
options: dirs objs/options.o objs/options_serial.o objs/options_ispc.o
|
||||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
$(CXX) $(CXXFLAGS) -o $@ objs/options.o objs/options_ispc.o objs/options_serial.o -lm
|
||||||
|
|
||||||
objs/%.o: %.cpp
|
objs/%.o: %.cpp
|
||||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||||
|
|
||||||
objs/%.o: ../%.cpp
|
|
||||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
|
||||||
|
|
||||||
objs/options.o: objs/options_ispc.h options_defs.h
|
objs/options.o: objs/options_ispc.h options_defs.h
|
||||||
|
|
||||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc options_defs.h
|
objs/%_ispc.h objs/%_ispc.o: %.ispc options_defs.h
|
||||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||||
|
|||||||
@@ -31,8 +31,6 @@
|
|||||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define NOMINMAX
|
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@@ -43,6 +41,7 @@ using std::max;
|
|||||||
|
|
||||||
#include "options_defs.h"
|
#include "options_defs.h"
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
|
#include "../cpuid.h"
|
||||||
|
|
||||||
#include "options_ispc.h"
|
#include "options_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
@@ -55,32 +54,49 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
|
|||||||
float ra[], float va[],
|
float ra[], float va[],
|
||||||
float result[], int count);
|
float result[], int count);
|
||||||
|
|
||||||
static void usage() {
|
// Make sure that the vector ISA used during compilation is supported by
|
||||||
printf("usage: options [--count=<num options>]\n");
|
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||||
|
// header file that we include above.
|
||||||
|
static void
|
||||||
|
ensureTargetISAIsSupported() {
|
||||||
|
#if defined(ISPC_TARGET_SSE2)
|
||||||
|
bool isaSupported = CPUSupportsSSE2();
|
||||||
|
const char *target = "SSE2";
|
||||||
|
#elif defined(ISPC_TARGET_SSE4)
|
||||||
|
bool isaSupported = CPUSupportsSSE4();
|
||||||
|
const char *target = "SSE4";
|
||||||
|
#elif defined(ISPC_TARGET_AVX)
|
||||||
|
bool isaSupported = CPUSupportsAVX();
|
||||||
|
const char *target = "AVX";
|
||||||
|
#else
|
||||||
|
#error "Unknown ISPC_TARGET_* value"
|
||||||
|
#endif
|
||||||
|
if (!isaSupported) {
|
||||||
|
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||||
|
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||||
|
fprintf(stderr, "***\n*** Please modify the "
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
"MSVC project file "
|
||||||
|
#else
|
||||||
|
"Makefile "
|
||||||
|
#endif
|
||||||
|
"to select another target (e.g. sse2)\n***\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main() {
|
||||||
int nOptions = 128*1024;
|
ensureTargetISAIsSupported();
|
||||||
|
|
||||||
|
float *S = new float[N_OPTIONS];
|
||||||
|
float *X = new float[N_OPTIONS];
|
||||||
|
float *T = new float[N_OPTIONS];
|
||||||
|
float *r = new float[N_OPTIONS];
|
||||||
|
float *v = new float[N_OPTIONS];
|
||||||
|
float *result = new float[N_OPTIONS];
|
||||||
|
|
||||||
for (int i = 1; i < argc; ++i) {
|
for (int i = 0; i < N_OPTIONS; ++i) {
|
||||||
if (strncmp(argv[i], "--count=", 8) == 0) {
|
|
||||||
nOptions = atoi(argv[i] + 8);
|
|
||||||
if (nOptions <= 0) {
|
|
||||||
usage();
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
float *S = new float[nOptions];
|
|
||||||
float *X = new float[nOptions];
|
|
||||||
float *T = new float[nOptions];
|
|
||||||
float *r = new float[nOptions];
|
|
||||||
float *v = new float[nOptions];
|
|
||||||
float *result = new float[nOptions];
|
|
||||||
|
|
||||||
for (int i = 0; i < nOptions; ++i) {
|
|
||||||
S[i] = 100; // stock price
|
S[i] = 100; // stock price
|
||||||
X[i] = 98; // option strike price
|
X[i] = 98; // option strike price
|
||||||
T[i] = 2; // time (years)
|
T[i] = 2; // time (years)
|
||||||
@@ -88,109 +104,61 @@ int main(int argc, char *argv[]) {
|
|||||||
v[i] = 5; // volatility
|
v[i] = 5; // volatility
|
||||||
}
|
}
|
||||||
|
|
||||||
double sum;
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Binomial options pricing model, ispc implementation
|
// Binomial options pricing model, ispc implementation
|
||||||
//
|
//
|
||||||
double binomial_ispc = 1e30;
|
reset_and_start_timer();
|
||||||
for (int i = 0; i < 3; ++i) {
|
binomial_put_ispc(S, X, T, r, v, result, N_OPTIONS);
|
||||||
reset_and_start_timer();
|
double binomial_ispc = get_elapsed_mcycles();
|
||||||
binomial_put_ispc(S, X, T, r, v, result, nOptions);
|
float sum = 0.f;
|
||||||
double dt = get_elapsed_mcycles();
|
for (int i = 0; i < N_OPTIONS; ++i)
|
||||||
sum = 0.;
|
sum += result[i];
|
||||||
for (int i = 0; i < nOptions; ++i)
|
printf("[binomial ispc]:\t\t[%.3f] million cycles (avg %f)\n",
|
||||||
sum += result[i];
|
binomial_ispc, sum / N_OPTIONS);
|
||||||
binomial_ispc = std::min(binomial_ispc, dt);
|
|
||||||
}
|
|
||||||
printf("[binomial ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n",
|
|
||||||
binomial_ispc, sum / nOptions);
|
|
||||||
|
|
||||||
//
|
|
||||||
// Binomial options pricing model, ispc implementation, tasks
|
|
||||||
//
|
|
||||||
double binomial_tasks = 1e30;
|
|
||||||
for (int i = 0; i < 3; ++i) {
|
|
||||||
reset_and_start_timer();
|
|
||||||
binomial_put_ispc_tasks(S, X, T, r, v, result, nOptions);
|
|
||||||
double dt = get_elapsed_mcycles();
|
|
||||||
sum = 0.;
|
|
||||||
for (int i = 0; i < nOptions; ++i)
|
|
||||||
sum += result[i];
|
|
||||||
binomial_tasks = std::min(binomial_tasks, dt);
|
|
||||||
}
|
|
||||||
printf("[binomial ispc, tasks]:\t\t[%.3f] million cycles (avg %f)\n",
|
|
||||||
binomial_tasks, sum / nOptions);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Binomial options, serial implementation
|
// Binomial options, serial implementation
|
||||||
//
|
//
|
||||||
double binomial_serial = 1e30;
|
reset_and_start_timer();
|
||||||
for (int i = 0; i < 3; ++i) {
|
binomial_put_serial(S, X, T, r, v, result, N_OPTIONS);
|
||||||
reset_and_start_timer();
|
double binomial_serial = get_elapsed_mcycles();
|
||||||
binomial_put_serial(S, X, T, r, v, result, nOptions);
|
sum = 0.f;
|
||||||
double dt = get_elapsed_mcycles();
|
for (int i = 0; i < N_OPTIONS; ++i)
|
||||||
sum = 0.;
|
sum += result[i];
|
||||||
for (int i = 0; i < nOptions; ++i)
|
|
||||||
sum += result[i];
|
|
||||||
binomial_serial = std::min(binomial_serial, dt);
|
|
||||||
}
|
|
||||||
printf("[binomial serial]:\t\t[%.3f] million cycles (avg %f)\n",
|
printf("[binomial serial]:\t\t[%.3f] million cycles (avg %f)\n",
|
||||||
binomial_serial, sum / nOptions);
|
binomial_serial, sum / N_OPTIONS);
|
||||||
|
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", binomial_serial / binomial_ispc);
|
||||||
binomial_serial / binomial_ispc, binomial_serial / binomial_tasks);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Black-Scholes options pricing model, ispc implementation, 1 thread
|
// Black-Scholes options pricing model, ispc implementation
|
||||||
//
|
//
|
||||||
double bs_ispc = 1e30;
|
sum = 0.f;
|
||||||
for (int i = 0; i < 3; ++i) {
|
reset_and_start_timer();
|
||||||
reset_and_start_timer();
|
for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) {
|
||||||
black_scholes_ispc(S, X, T, r, v, result, nOptions);
|
black_scholes_ispc(S, X, T, r, v, result, N_OPTIONS);
|
||||||
double dt = get_elapsed_mcycles();
|
for (int i = 0; i < N_OPTIONS; ++i)
|
||||||
sum = 0.;
|
|
||||||
for (int i = 0; i < nOptions; ++i)
|
|
||||||
sum += result[i];
|
sum += result[i];
|
||||||
bs_ispc = std::min(bs_ispc, dt);
|
|
||||||
}
|
}
|
||||||
printf("[black-scholes ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n",
|
double bs_ispc = get_elapsed_mcycles();
|
||||||
bs_ispc, sum / nOptions);
|
printf("[black-scholes ispc]:\t\t[%.3f] million cycles (avg %f)\n",
|
||||||
|
bs_ispc, sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS));
|
||||||
//
|
|
||||||
// Black-Scholes options pricing model, ispc implementation, tasks
|
|
||||||
//
|
|
||||||
double bs_ispc_tasks = 1e30;
|
|
||||||
for (int i = 0; i < 3; ++i) {
|
|
||||||
reset_and_start_timer();
|
|
||||||
black_scholes_ispc_tasks(S, X, T, r, v, result, nOptions);
|
|
||||||
double dt = get_elapsed_mcycles();
|
|
||||||
sum = 0.;
|
|
||||||
for (int i = 0; i < nOptions; ++i)
|
|
||||||
sum += result[i];
|
|
||||||
bs_ispc_tasks = std::min(bs_ispc_tasks, dt);
|
|
||||||
}
|
|
||||||
printf("[black-scholes ispc, tasks]:\t[%.3f] million cycles (avg %f)\n",
|
|
||||||
bs_ispc_tasks, sum / nOptions);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Black-Scholes options pricing model, serial implementation
|
// Black-Scholes options pricing model, serial implementation
|
||||||
//
|
//
|
||||||
double bs_serial = 1e30;
|
sum = 0.f;
|
||||||
for (int i = 0; i < 3; ++i) {
|
reset_and_start_timer();
|
||||||
reset_and_start_timer();
|
for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) {
|
||||||
black_scholes_serial(S, X, T, r, v, result, nOptions);
|
black_scholes_serial(S, X, T, r, v, result, N_OPTIONS);
|
||||||
double dt = get_elapsed_mcycles();
|
for (int i = 0; i < N_OPTIONS; ++i)
|
||||||
sum = 0.;
|
|
||||||
for (int i = 0; i < nOptions; ++i)
|
|
||||||
sum += result[i];
|
sum += result[i];
|
||||||
bs_serial = std::min(bs_serial, dt);
|
|
||||||
}
|
}
|
||||||
|
double bs_serial = get_elapsed_mcycles();
|
||||||
printf("[black-scholes serial]:\t\t[%.3f] million cycles (avg %f)\n", bs_serial,
|
printf("[black-scholes serial]:\t\t[%.3f] million cycles (avg %f)\n", bs_serial,
|
||||||
sum / nOptions);
|
sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS));
|
||||||
|
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", bs_serial / bs_ispc);
|
||||||
bs_serial / bs_ispc, bs_serial / bs_ispc_tasks);
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -55,100 +55,49 @@ CND(float X) {
|
|||||||
return w;
|
return w;
|
||||||
}
|
}
|
||||||
|
|
||||||
task void
|
|
||||||
bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
|
||||||
uniform float ra[], uniform float va[],
|
|
||||||
uniform float result[], uniform int count) {
|
|
||||||
uniform int first = taskIndex * (count/taskCount);
|
|
||||||
uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
|
|
||||||
|
|
||||||
foreach (i = first ... last) {
|
|
||||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
|
||||||
|
|
||||||
float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
|
|
||||||
float d2 = d1 - v * sqrt(T);
|
|
||||||
|
|
||||||
result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
export void
|
|
||||||
black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
|
||||||
uniform float ra[], uniform float va[],
|
|
||||||
uniform float result[], uniform int count) {
|
|
||||||
uniform int nTasks = max((int)64, (int)count/16384);
|
|
||||||
launch[nTasks] < bs_task(Sa, Xa, Ta, ra, va, result, count) >;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
export void
|
export void
|
||||||
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||||
uniform float ra[], uniform float va[],
|
uniform float ra[], uniform float va[],
|
||||||
uniform float result[], uniform int count) {
|
uniform float result[], uniform int count) {
|
||||||
foreach (i = 0 ... count) {
|
for (uniform int i = 0; i < count; i += programCount) {
|
||||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
float S = Sa[i + programIndex], X = Xa[i + programIndex];
|
||||||
|
float T = Ta[i + programIndex], r = ra[i + programIndex];
|
||||||
|
float v = va[i + programIndex];
|
||||||
|
|
||||||
float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
|
float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
|
||||||
float d2 = d1 - v * sqrt(T);
|
float d2 = d1 - v * sqrt(T);
|
||||||
|
|
||||||
result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
|
result[i + programIndex] = S * CND(d1) - X * exp(-r * T) * CND(d2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static inline float
|
|
||||||
binomial_put(float S, float X, float T, float r, float v) {
|
|
||||||
float V[BINOMIAL_NUM];
|
|
||||||
|
|
||||||
float dt = T / BINOMIAL_NUM;
|
|
||||||
float u = exp(v * sqrt(dt));
|
|
||||||
float d = 1. / u;
|
|
||||||
float disc = exp(r * dt);
|
|
||||||
float Pu = (disc - d) / (u - d);
|
|
||||||
|
|
||||||
for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
|
|
||||||
float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
|
|
||||||
V[j] = max(0., X - S * upow);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
|
|
||||||
for (uniform int k = 0; k < j; ++k)
|
|
||||||
V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
|
|
||||||
return V[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
export void
|
export void
|
||||||
binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||||
uniform float ra[], uniform float va[],
|
uniform float ra[], uniform float va[],
|
||||||
uniform float result[], uniform int count) {
|
uniform float result[], uniform int count) {
|
||||||
foreach (i = 0 ... count) {
|
float V[BINOMIAL_NUM];
|
||||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
|
||||||
result[i] = binomial_put(S, X, T, r, v);
|
for (uniform int i = 0; i < count; i += programCount) {
|
||||||
|
float S = Sa[i + programIndex], X = Xa[i + programIndex];
|
||||||
|
float T = Ta[i + programIndex], r = ra[i + programIndex];
|
||||||
|
float v = va[i + programIndex];
|
||||||
|
|
||||||
|
float dt = T / BINOMIAL_NUM;
|
||||||
|
float u = exp(v * sqrt(dt));
|
||||||
|
float d = 1. / u;
|
||||||
|
float disc = exp(r * dt);
|
||||||
|
float Pu = (disc - d) / (u - d);
|
||||||
|
|
||||||
|
for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
|
||||||
|
float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
|
||||||
|
V[j] = max(0., X - S * upow);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
|
||||||
|
for (uniform int k = 0; k < j; ++k)
|
||||||
|
V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
|
||||||
|
|
||||||
|
result[i + programIndex] = V[0];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
task void
|
|
||||||
binomial_task(uniform float Sa[], uniform float Xa[],
|
|
||||||
uniform float Ta[], uniform float ra[],
|
|
||||||
uniform float va[], uniform float result[],
|
|
||||||
uniform int count) {
|
|
||||||
uniform int first = taskIndex * (count/taskCount);
|
|
||||||
uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
|
|
||||||
|
|
||||||
foreach (i = first ... last) {
|
|
||||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
|
||||||
result[i] = binomial_put(S, X, T, r, v);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
export void
|
|
||||||
binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
|
|
||||||
uniform float Ta[], uniform float ra[],
|
|
||||||
uniform float va[], uniform float result[],
|
|
||||||
uniform int count) {
|
|
||||||
uniform int nTasks = max((int)64, (int)count/16384);
|
|
||||||
launch[nTasks] < binomial_task(Sa, Xa, Ta, ra, va, result, count) >;
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
<ItemGroup Label="ProjectConfigurations">
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
<ProjectConfiguration Include="Debug|Win32">
|
<ProjectConfiguration Include="Debug|Win32">
|
||||||
@@ -64,19 +64,15 @@
|
|||||||
<PropertyGroup Label="UserMacros" />
|
<PropertyGroup Label="UserMacros" />
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<ClCompile>
|
<ClCompile>
|
||||||
@@ -85,7 +81,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
@@ -102,7 +97,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
@@ -121,7 +115,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -141,7 +134,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -155,23 +147,22 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClCompile Include="options.cpp" />
|
<ClCompile Include="options.cpp" />
|
||||||
<ClCompile Include="options_serial.cpp" />
|
<ClCompile Include="options_serial.cpp" />
|
||||||
<ClCompile Include="../tasksys.cpp" />
|
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="options.ispc">
|
<CustomBuild Include="options.ispc">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -35,6 +35,8 @@
|
|||||||
#define OPTIONS_DEFS_H 1
|
#define OPTIONS_DEFS_H 1
|
||||||
|
|
||||||
#define BINOMIAL_NUM 64
|
#define BINOMIAL_NUM 64
|
||||||
|
#define N_OPTIONS 65536
|
||||||
|
#define N_BLACK_SCHOLES_ROUNDS 20
|
||||||
|
|
||||||
|
|
||||||
#endif // OPTIONS_DEFS_H
|
#endif // OPTIONS_DEFS_H
|
||||||
|
|||||||
@@ -8,10 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
|||||||
CXX=g++
|
CXX=g++
|
||||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
|
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||||
|
|
||||||
OBJS=objs/rt.o objs/rt_serial.o $(TASK_OBJ) objs/rt_ispc.o objs/rt_ispc_sse2.o \
|
|
||||||
objs/rt_ispc_sse4.o objs/rt_ispc_avx.o
|
|
||||||
|
|
||||||
default: rt
|
default: rt
|
||||||
|
|
||||||
@@ -23,8 +20,8 @@ dirs:
|
|||||||
clean:
|
clean:
|
||||||
/bin/rm -rf objs *~ rt
|
/bin/rm -rf objs *~ rt
|
||||||
|
|
||||||
rt: dirs $(OBJS)
|
rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o $(TASK_OBJ)
|
||||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||||
|
|
||||||
objs/%.o: %.cpp
|
objs/%.o: %.cpp
|
||||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||||
@@ -34,5 +31,5 @@ objs/%.o: ../%.cpp
|
|||||||
|
|
||||||
objs/rt.o: objs/rt_ispc.h
|
objs/rt.o: objs/rt_ispc.h
|
||||||
|
|
||||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||||
|
|||||||
@@ -45,6 +45,7 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
|
#include "../cpuid.h"
|
||||||
#include "rt_ispc.h"
|
#include "rt_ispc.h"
|
||||||
|
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
@@ -95,6 +96,38 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Make sure that the vector ISA used during compilation is supported by
|
||||||
|
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||||
|
// header file that we include above.
|
||||||
|
static void
|
||||||
|
ensureTargetISAIsSupported() {
|
||||||
|
#if defined(ISPC_TARGET_SSE2)
|
||||||
|
bool isaSupported = CPUSupportsSSE2();
|
||||||
|
const char *target = "SSE2";
|
||||||
|
#elif defined(ISPC_TARGET_SSE4)
|
||||||
|
bool isaSupported = CPUSupportsSSE4();
|
||||||
|
const char *target = "SSE4";
|
||||||
|
#elif defined(ISPC_TARGET_AVX)
|
||||||
|
bool isaSupported = CPUSupportsAVX();
|
||||||
|
const char *target = "AVX";
|
||||||
|
#else
|
||||||
|
#error "Unknown ISPC_TARGET_* value"
|
||||||
|
#endif
|
||||||
|
if (!isaSupported) {
|
||||||
|
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||||
|
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||||
|
fprintf(stderr, "***\n*** Please modify the "
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
"MSVC project file "
|
||||||
|
#else
|
||||||
|
"Makefile "
|
||||||
|
#endif
|
||||||
|
"to select another target (e.g. sse2)\n***\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static void usage() {
|
static void usage() {
|
||||||
fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
|
fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
@@ -118,6 +151,8 @@ int main(int argc, char *argv[]) {
|
|||||||
if (filename == NULL)
|
if (filename == NULL)
|
||||||
usage();
|
usage();
|
||||||
|
|
||||||
|
ensureTargetISAIsSupported();
|
||||||
|
|
||||||
#define READ(var, n) \
|
#define READ(var, n) \
|
||||||
if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \
|
if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \
|
||||||
fprintf(stderr, "Unexpected EOF reading scene file\n"); \
|
fprintf(stderr, "Unexpected EOF reading scene file\n"); \
|
||||||
@@ -168,12 +203,12 @@ int main(int argc, char *argv[]) {
|
|||||||
// of node, the total number of int it if a leaf node, etc.
|
// of node, the total number of int it if a leaf node, etc.
|
||||||
float b[6];
|
float b[6];
|
||||||
READ(b[0], 6);
|
READ(b[0], 6);
|
||||||
nodes[i].bounds[0][0] = b[0];
|
nodes[i].bounds[0].v[0] = b[0];
|
||||||
nodes[i].bounds[0][1] = b[1];
|
nodes[i].bounds[0].v[1] = b[1];
|
||||||
nodes[i].bounds[0][2] = b[2];
|
nodes[i].bounds[0].v[2] = b[2];
|
||||||
nodes[i].bounds[1][0] = b[3];
|
nodes[i].bounds[1].v[0] = b[3];
|
||||||
nodes[i].bounds[1][1] = b[4];
|
nodes[i].bounds[1].v[1] = b[4];
|
||||||
nodes[i].bounds[1][2] = b[5];
|
nodes[i].bounds[1].v[2] = b[5];
|
||||||
READ(nodes[i].offset, 1);
|
READ(nodes[i].offset, 1);
|
||||||
READ(nodes[i].nPrimitives, 1);
|
READ(nodes[i].nPrimitives, 1);
|
||||||
READ(nodes[i].splitAxis, 1);
|
READ(nodes[i].splitAxis, 1);
|
||||||
@@ -190,17 +225,19 @@ int main(int argc, char *argv[]) {
|
|||||||
READ(v[0], 9);
|
READ(v[0], 9);
|
||||||
float *vp = v;
|
float *vp = v;
|
||||||
for (int j = 0; j < 3; ++j) {
|
for (int j = 0; j < 3; ++j) {
|
||||||
triangles[i].p[j][0] = *vp++;
|
triangles[i].p[j].v[0] = *vp++;
|
||||||
triangles[i].p[j][1] = *vp++;
|
triangles[i].p[j].v[1] = *vp++;
|
||||||
triangles[i].p[j][2] = *vp++;
|
triangles[i].p[j].v[2] = *vp++;
|
||||||
}
|
}
|
||||||
// And create an object id
|
// And create an object id
|
||||||
triangles[i].id = i+1;
|
triangles[i].id = i+1;
|
||||||
}
|
}
|
||||||
fclose(f);
|
fclose(f);
|
||||||
|
|
||||||
int height = int(baseHeight * scale);
|
// round image resolution up to multiple of 16 to make things easy for
|
||||||
int width = int(baseWidth * scale);
|
// the code that assigns pixels to ispc program instances
|
||||||
|
int height = (int(baseHeight * scale) + 0xf) & ~0xf;
|
||||||
|
int width = (int(baseWidth * scale) + 0xf) & ~0xf;
|
||||||
|
|
||||||
// allocate images; one to hold hit object ids, one to hold depth to
|
// allocate images; one to hold hit object ids, one to hold depth to
|
||||||
// the first interseciton
|
// the first interseciton
|
||||||
|
|||||||
@@ -43,13 +43,12 @@ struct Ray {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct Triangle {
|
struct Triangle {
|
||||||
uniform float p[3][4];
|
uniform float3 p[3];
|
||||||
uniform int id;
|
uniform int id;
|
||||||
uniform int pad[3];
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LinearBVHNode {
|
struct LinearBVHNode {
|
||||||
uniform float bounds[2][3];
|
uniform float3 bounds[2];
|
||||||
uniform unsigned int offset; // num primitives for leaf, second child for interior
|
uniform unsigned int offset; // num primitives for leaf, second child for interior
|
||||||
uniform unsigned int8 nPrimitives;
|
uniform unsigned int8 nPrimitives;
|
||||||
uniform unsigned int8 splitAxis;
|
uniform unsigned int8 splitAxis;
|
||||||
@@ -73,7 +72,7 @@ static inline float Dot(const float3 a, const float3 b) {
|
|||||||
|
|
||||||
static void generateRay(uniform const float raster2camera[4][4],
|
static void generateRay(uniform const float raster2camera[4][4],
|
||||||
uniform const float camera2world[4][4],
|
uniform const float camera2world[4][4],
|
||||||
float x, float y, Ray &ray) {
|
float x, float y, reference Ray ray) {
|
||||||
ray.mint = 0.f;
|
ray.mint = 0.f;
|
||||||
ray.maxt = 1e30f;
|
ray.maxt = 1e30f;
|
||||||
|
|
||||||
@@ -104,16 +103,14 @@ static void generateRay(uniform const float raster2camera[4][4],
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static inline bool BBoxIntersect(const uniform float bounds[2][3],
|
static inline bool BBoxIntersect(const reference uniform float3 bounds[2],
|
||||||
const Ray &ray) {
|
const reference Ray ray) {
|
||||||
uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
|
|
||||||
uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
|
|
||||||
float t0 = ray.mint, t1 = ray.maxt;
|
float t0 = ray.mint, t1 = ray.maxt;
|
||||||
|
|
||||||
// Check all three axis-aligned slabs. Don't try to early out; it's
|
// Check all three axis-aligned slabs. Don't try to early out; it's
|
||||||
// not worth the trouble
|
// not worth the trouble
|
||||||
float3 tNear = (bounds0 - ray.origin) * ray.invDir;
|
float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
|
||||||
float3 tFar = (bounds1 - ray.origin) * ray.invDir;
|
float3 tFar = (bounds[1] - ray.origin) * ray.invDir;
|
||||||
if (tNear.x > tFar.x) {
|
if (tNear.x > tFar.x) {
|
||||||
float tmp = tNear.x;
|
float tmp = tNear.x;
|
||||||
tNear.x = tFar.x;
|
tNear.x = tFar.x;
|
||||||
@@ -143,12 +140,9 @@ static inline bool BBoxIntersect(const uniform float bounds[2][3],
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
static inline bool TriIntersect(const reference Triangle tri, reference Ray ray) {
|
||||||
uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
|
uniform float3 e1 = tri.p[1] - tri.p[0];
|
||||||
uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
|
uniform float3 e2 = tri.p[2] - tri.p[0];
|
||||||
uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
|
|
||||||
uniform float3 e1 = p1 - p0;
|
|
||||||
uniform float3 e2 = p2 - p0;
|
|
||||||
|
|
||||||
float3 s1 = Cross(ray.dir, e2);
|
float3 s1 = Cross(ray.dir, e2);
|
||||||
float divisor = Dot(s1, e1);
|
float divisor = Dot(s1, e1);
|
||||||
@@ -159,7 +153,7 @@ static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
|||||||
float invDivisor = 1.f / divisor;
|
float invDivisor = 1.f / divisor;
|
||||||
|
|
||||||
// Compute first barycentric coordinate
|
// Compute first barycentric coordinate
|
||||||
float3 d = ray.origin - p0;
|
float3 d = ray.origin - tri.p[0];
|
||||||
float b1 = Dot(d, s1) * invDivisor;
|
float b1 = Dot(d, s1) * invDivisor;
|
||||||
if (b1 < 0. || b1 > 1.)
|
if (b1 < 0. || b1 > 1.)
|
||||||
hit = false;
|
hit = false;
|
||||||
@@ -184,7 +178,7 @@ static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
|||||||
|
|
||||||
|
|
||||||
bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||||
Ray &r) {
|
reference Ray r) {
|
||||||
Ray ray = r;
|
Ray ray = r;
|
||||||
bool hit = false;
|
bool hit = false;
|
||||||
// Follow ray through BVH nodes to find primitive intersections
|
// Follow ray through BVH nodes to find primitive intersections
|
||||||
@@ -244,15 +238,34 @@ static void raytrace_tile(uniform int x0, uniform int x1,
|
|||||||
uniform float widthScale = (float)(baseWidth) / (float)(width);
|
uniform float widthScale = (float)(baseWidth) / (float)(width);
|
||||||
uniform float heightScale = (float)(baseHeight) / (float)(height);
|
uniform float heightScale = (float)(baseHeight) / (float)(height);
|
||||||
|
|
||||||
foreach_tiled (y = y0 ... y1, x = x0 ... x1) {
|
static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
|
||||||
Ray ray;
|
0, 1, 0, 1, 2, 3, 2, 3 };
|
||||||
generateRay(raster2camera, camera2world, x*widthScale,
|
static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
|
||||||
y*heightScale, ray);
|
2, 2, 3, 3, 2, 2, 3, 3 };
|
||||||
BVHIntersect(nodes, triangles, ray);
|
|
||||||
|
|
||||||
int offset = y * width + x;
|
// The outer loops are always over blocks of 4x4 pixels
|
||||||
image[offset] = ray.maxt;
|
for (uniform int y = y0; y < y1; y += 4) {
|
||||||
id[offset] = ray.hitId;
|
for (uniform int x = x0; x < x1; x += 4) {
|
||||||
|
// Now we have a block of 4x4=16 pixels to process; it will
|
||||||
|
// take 16/programCount iterations of this loop to process
|
||||||
|
// them.
|
||||||
|
for (uniform int o = 0; o < 16 / programCount; ++o) {
|
||||||
|
// Map program instances to samples in the udx/udy arrays
|
||||||
|
// to figure out which pixel each program instance is
|
||||||
|
// responsible for
|
||||||
|
const float dx = udx[o * programCount + programIndex];
|
||||||
|
const float dy = udy[o * programCount + programIndex];
|
||||||
|
|
||||||
|
Ray ray;
|
||||||
|
generateRay(raster2camera, camera2world, (x+dx)*widthScale,
|
||||||
|
(y+dy)*heightScale, ray);
|
||||||
|
BVHIntersect(nodes, triangles, ray);
|
||||||
|
|
||||||
|
int offset = (y + (int)dy) * width + (x + (int)dx);
|
||||||
|
image[offset] = ray.maxt;
|
||||||
|
id[offset] = ray.hitId;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -270,19 +283,19 @@ export void raytrace_ispc(uniform int width, uniform int height,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
task void raytrace_tile_task(uniform int width, uniform int height,
|
task void raytrace_tile_task(uniform int y0, uniform int y1,
|
||||||
|
uniform int width, uniform int height,
|
||||||
uniform int baseWidth, uniform int baseHeight,
|
uniform int baseWidth, uniform int baseHeight,
|
||||||
const uniform float raster2camera[4][4],
|
const uniform float raster2camera[4][4],
|
||||||
const uniform float camera2world[4][4],
|
const uniform float camera2world[4][4],
|
||||||
uniform float image[], uniform int id[],
|
uniform float image[], uniform int id[],
|
||||||
const LinearBVHNode nodes[],
|
const LinearBVHNode nodes[],
|
||||||
const Triangle triangles[]) {
|
const Triangle triangles[]) {
|
||||||
uniform int dx = 16, dy = 16; // must match dx, dy below
|
uniform int dx = 16; // must match dx below
|
||||||
uniform int xBuckets = (width + (dx-1)) / dx;
|
uniform int xTasks = (width + (dx-1)) / dx;
|
||||||
uniform int x0 = (taskIndex % xBuckets) * dx;
|
uniform int x0 = (taskIndex % xTasks) * dx;
|
||||||
uniform int x1 = min(x0 + dx, width);
|
uniform int x1 = x0 + dx;
|
||||||
uniform int y0 = (taskIndex / xBuckets) * dy;
|
x1 = min(x1, width);
|
||||||
uniform int y1 = min(y0 + dy, height);
|
|
||||||
|
|
||||||
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
|
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
|
||||||
raster2camera, camera2world, image,
|
raster2camera, camera2world, image,
|
||||||
@@ -298,11 +311,11 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
|
|||||||
const LinearBVHNode nodes[],
|
const LinearBVHNode nodes[],
|
||||||
const Triangle triangles[]) {
|
const Triangle triangles[]) {
|
||||||
uniform int dx = 16, dy = 16;
|
uniform int dx = 16, dy = 16;
|
||||||
uniform int xBuckets = (width + (dx-1)) / dx;
|
uniform int nTasks = (width + (dx-1)) / dx;
|
||||||
uniform int yBuckets = (height + (dy-1)) / dy;
|
for (uniform int y = 0; y < height; y += dy) {
|
||||||
uniform int nTasks = xBuckets * yBuckets;
|
uniform int y1 = min(y + dy, height);
|
||||||
launch[nTasks] < raytrace_tile_task(width, height, baseWidth, baseHeight,
|
launch[nTasks] < raytrace_tile_task(y, y1, width, height, baseWidth,
|
||||||
raster2camera, camera2world,
|
baseHeight, raster2camera, camera2world,
|
||||||
image, id, nodes, triangles) >;
|
image, id, nodes, triangles) >;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -64,19 +64,15 @@
|
|||||||
<PropertyGroup Label="UserMacros" />
|
<PropertyGroup Label="UserMacros" />
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<ClCompile>
|
<ClCompile>
|
||||||
@@ -85,7 +81,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -101,7 +96,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -119,7 +113,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -138,7 +131,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -152,21 +144,21 @@
|
|||||||
<CustomBuild Include="rt.ispc">
|
<CustomBuild Include="rt.ispc">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -75,13 +75,12 @@ struct Ray {
|
|||||||
// Declare these in a namespace so the mangling matches
|
// Declare these in a namespace so the mangling matches
|
||||||
namespace ispc {
|
namespace ispc {
|
||||||
struct Triangle {
|
struct Triangle {
|
||||||
float p[3][4]; // extra float pad after each vertex
|
float3 p[3];
|
||||||
int32_t id;
|
int32_t id;
|
||||||
int32_t pad[3]; // make 16 x 32-bits
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LinearBVHNode {
|
struct LinearBVHNode {
|
||||||
float bounds[2][3];
|
float3 bounds[2];
|
||||||
int32_t offset; // primitives for leaf, second child for interior
|
int32_t offset; // primitives for leaf, second child for interior
|
||||||
uint8_t nPrimitives;
|
uint8_t nPrimitives;
|
||||||
uint8_t splitAxis;
|
uint8_t splitAxis;
|
||||||
@@ -141,14 +140,12 @@ static void generateRay(const float raster2camera[4][4],
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static inline bool BBoxIntersect(const float bounds[2][3],
|
static inline bool BBoxIntersect(const float3 bounds[2],
|
||||||
const Ray &ray) {
|
const Ray &ray) {
|
||||||
float3 bounds0(bounds[0][0], bounds[0][1], bounds[0][2]);
|
|
||||||
float3 bounds1(bounds[1][0], bounds[1][1], bounds[1][2]);
|
|
||||||
float t0 = ray.mint, t1 = ray.maxt;
|
float t0 = ray.mint, t1 = ray.maxt;
|
||||||
|
|
||||||
float3 tNear = (bounds0 - ray.origin) * ray.invDir;
|
float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
|
||||||
float3 tFar = (bounds1 - ray.origin) * ray.invDir;
|
float3 tFar = (bounds[1] - ray.origin) * ray.invDir;
|
||||||
if (tNear.x > tFar.x) {
|
if (tNear.x > tFar.x) {
|
||||||
float tmp = tNear.x;
|
float tmp = tNear.x;
|
||||||
tNear.x = tFar.x;
|
tNear.x = tFar.x;
|
||||||
@@ -179,11 +176,8 @@ static inline bool BBoxIntersect(const float bounds[2][3],
|
|||||||
|
|
||||||
|
|
||||||
inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
||||||
float3 p0(tri.p[0][0], tri.p[0][1], tri.p[0][2]);
|
float3 e1 = tri.p[1] - tri.p[0];
|
||||||
float3 p1(tri.p[1][0], tri.p[1][1], tri.p[1][2]);
|
float3 e2 = tri.p[2] - tri.p[0];
|
||||||
float3 p2(tri.p[2][0], tri.p[2][1], tri.p[2][2]);
|
|
||||||
float3 e1 = p1 - p0;
|
|
||||||
float3 e2 = p2 - p0;
|
|
||||||
|
|
||||||
float3 s1 = Cross(ray.dir, e2);
|
float3 s1 = Cross(ray.dir, e2);
|
||||||
float divisor = Dot(s1, e1);
|
float divisor = Dot(s1, e1);
|
||||||
@@ -193,7 +187,7 @@ inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
|||||||
float invDivisor = 1.f / divisor;
|
float invDivisor = 1.f / divisor;
|
||||||
|
|
||||||
// Compute first barycentric coordinate
|
// Compute first barycentric coordinate
|
||||||
float3 d = ray.origin - p0;
|
float3 d = ray.origin - tri.p[0];
|
||||||
float b1 = Dot(d, s1) * invDivisor;
|
float b1 = Dot(d, s1) * invDivisor;
|
||||||
if (b1 < 0. || b1 > 1.)
|
if (b1 < 0. || b1 > 1.)
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
CXX=g++ -m64
|
CXX=g++ -m64
|
||||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O2 --arch=x86-64 --target=sse2
|
ISPCFLAGS=-O2 --arch=x86-64
|
||||||
|
|
||||||
default: simple
|
default: simple
|
||||||
|
|
||||||
|
|||||||
@@ -33,12 +33,47 @@
|
|||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include "../cpuid.h"
|
||||||
|
|
||||||
// Include the header file that the ispc compiler generates
|
// Include the header file that the ispc compiler generates
|
||||||
#include "simple_ispc.h"
|
#include "simple_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
|
// Make sure that the vector ISA used during compilation is supported by
|
||||||
|
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||||
|
// header file that we include above.
|
||||||
|
static void
|
||||||
|
ensureTargetISAIsSupported() {
|
||||||
|
#if defined(ISPC_TARGET_SSE2)
|
||||||
|
bool isaSupported = CPUSupportsSSE2();
|
||||||
|
const char *target = "SSE2";
|
||||||
|
#elif defined(ISPC_TARGET_SSE4)
|
||||||
|
bool isaSupported = CPUSupportsSSE4();
|
||||||
|
const char *target = "SSE4";
|
||||||
|
#elif defined(ISPC_TARGET_AVX)
|
||||||
|
bool isaSupported = CPUSupportsAVX();
|
||||||
|
const char *target = "AVX";
|
||||||
|
#else
|
||||||
|
#error "Unknown ISPC_TARGET_* value"
|
||||||
|
#endif
|
||||||
|
if (!isaSupported) {
|
||||||
|
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||||
|
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||||
|
fprintf(stderr, "***\n*** Please modify the "
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
"MSVC project file "
|
||||||
|
#else
|
||||||
|
"Makefile "
|
||||||
|
#endif
|
||||||
|
"to select another target (e.g. sse2)\n***\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
|
ensureTargetISAIsSupported();
|
||||||
|
|
||||||
float vin[16], vout[16];
|
float vin[16], vout[16];
|
||||||
|
|
||||||
// Initialize input buffer
|
// Initialize input buffer
|
||||||
|
|||||||
@@ -34,7 +34,9 @@
|
|||||||
|
|
||||||
export void simple(uniform float vin[], uniform float vout[],
|
export void simple(uniform float vin[], uniform float vout[],
|
||||||
uniform int count) {
|
uniform int count) {
|
||||||
foreach (index = 0 ... count) {
|
// Compute the result for 'programCount' values in parallel
|
||||||
|
for (uniform int i = 0; i < count; i += programCount) {
|
||||||
|
int index = i + programIndex;
|
||||||
// Load the appropriate input value for this program instance.
|
// Load the appropriate input value for this program instance.
|
||||||
float v = vin[index];
|
float v = vin[index];
|
||||||
|
|
||||||
|
|||||||
@@ -25,21 +25,21 @@
|
|||||||
<CustomBuild Include="simple.ispc">
|
<CustomBuild Include="simple.ispc">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<PropertyGroup Label="Globals">
|
<PropertyGroup Label="Globals">
|
||||||
@@ -88,19 +88,15 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
|
|||||||
<PropertyGroup Label="UserMacros" />
|
<PropertyGroup Label="UserMacros" />
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<ClCompile>
|
<ClCompile>
|
||||||
@@ -109,7 +105,6 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
<SubSystem>Console</SubSystem>
|
<SubSystem>Console</SubSystem>
|
||||||
@@ -123,7 +118,6 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
<SubSystem>Console</SubSystem>
|
<SubSystem>Console</SubSystem>
|
||||||
@@ -139,7 +133,6 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
<SubSystem>Console</SubSystem>
|
<SubSystem>Console</SubSystem>
|
||||||
@@ -157,7 +150,6 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
<SubSystem>Console</SubSystem>
|
<SubSystem>Console</SubSystem>
|
||||||
@@ -169,4 +161,4 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
|
|||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|||||||
@@ -8,11 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
|||||||
CXX=g++
|
CXX=g++
|
||||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
|
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||||
|
|
||||||
OBJS=objs/stencil.o objs/stencil_serial.o $(TASK_OBJ) objs/stencil_ispc.o \
|
|
||||||
objs/stencil_ispc_sse2.o objs/stencil_ispc_sse4.o \
|
|
||||||
objs/stencil_ispc_avx.o
|
|
||||||
|
|
||||||
default: stencil
|
default: stencil
|
||||||
|
|
||||||
@@ -24,8 +20,8 @@ dirs:
|
|||||||
clean:
|
clean:
|
||||||
/bin/rm -rf objs *~ stencil
|
/bin/rm -rf objs *~ stencil
|
||||||
|
|
||||||
stencil: dirs $(OBJS)
|
stencil: dirs objs/stencil.o objs/stencil_serial.o objs/stencil_ispc.o $(TASK_OBJ)
|
||||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
$(CXX) $(CXXFLAGS) -o $@ objs/stencil.o objs/stencil_ispc.o objs/stencil_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||||
|
|
||||||
objs/%.o: %.cpp
|
objs/%.o: %.cpp
|
||||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||||
@@ -35,5 +31,5 @@ objs/%.o: ../%.cpp
|
|||||||
|
|
||||||
objs/stencil.o: objs/stencil_ispc.h
|
objs/stencil.o: objs/stencil_ispc.h
|
||||||
|
|
||||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||||
|
|||||||
@@ -42,10 +42,43 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
|
#include "../cpuid.h"
|
||||||
#include "stencil_ispc.h"
|
#include "stencil_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
|
|
||||||
|
// Make sure that the vector ISA used during compilation is supported by
|
||||||
|
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||||
|
// header file that we include above.
|
||||||
|
static void
|
||||||
|
ensureTargetISAIsSupported() {
|
||||||
|
#if defined(ISPC_TARGET_SSE2)
|
||||||
|
bool isaSupported = CPUSupportsSSE2();
|
||||||
|
const char *target = "SSE2";
|
||||||
|
#elif defined(ISPC_TARGET_SSE4)
|
||||||
|
bool isaSupported = CPUSupportsSSE4();
|
||||||
|
const char *target = "SSE4";
|
||||||
|
#elif defined(ISPC_TARGET_AVX)
|
||||||
|
bool isaSupported = CPUSupportsAVX();
|
||||||
|
const char *target = "AVX";
|
||||||
|
#else
|
||||||
|
#error "Unknown ISPC_TARGET_* value"
|
||||||
|
#endif
|
||||||
|
if (!isaSupported) {
|
||||||
|
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||||
|
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||||
|
fprintf(stderr, "***\n*** Please modify the "
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
"MSVC project file "
|
||||||
|
#else
|
||||||
|
"Makefile "
|
||||||
|
#endif
|
||||||
|
"to select another target (e.g. sse2)\n***\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
||||||
int y0, int y1, int z0, int z1,
|
int y0, int y1, int z0, int z1,
|
||||||
int Nx, int Ny, int Nz,
|
int Nx, int Ny, int Nz,
|
||||||
@@ -67,6 +100,8 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
|
|||||||
|
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
|
ensureTargetISAIsSupported();
|
||||||
|
|
||||||
int Nx = 256, Ny = 256, Nz = 256;
|
int Nx = 256, Ny = 256, Nz = 256;
|
||||||
int width = 4;
|
int width = 4;
|
||||||
float *Aserial[2], *Aispc[2];
|
float *Aserial[2], *Aispc[2];
|
||||||
|
|||||||
@@ -43,8 +43,9 @@ stencil_step(uniform int x0, uniform int x1,
|
|||||||
|
|
||||||
for (uniform int z = z0; z < z1; ++z) {
|
for (uniform int z = z0; z < z1; ++z) {
|
||||||
for (uniform int y = y0; y < y1; ++y) {
|
for (uniform int y = y0; y < y1; ++y) {
|
||||||
foreach (x = x0 ... x1) {
|
// Assumes that (x1-x0) % programCount == 0
|
||||||
int index = (z * Nxy) + (y * Nx) + x;
|
for (uniform int x = x0; x < x1; x += programCount) {
|
||||||
|
int index = (z * Nxy) + (y * Nx) + x + programIndex;
|
||||||
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||||
float div = coef[0] * A_cur(0, 0, 0) +
|
float div = coef[0] * A_cur(0, 0, 0) +
|
||||||
|
|||||||
@@ -64,19 +64,15 @@
|
|||||||
<PropertyGroup Label="UserMacros" />
|
<PropertyGroup Label="UserMacros" />
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<ClCompile>
|
<ClCompile>
|
||||||
@@ -85,7 +81,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -101,7 +96,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -119,7 +113,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -138,7 +131,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -152,21 +144,21 @@
|
|||||||
<CustomBuild Include="stencil.ispc">
|
<CustomBuild Include="stencil.ispc">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -53,7 +53,9 @@
|
|||||||
#define ISPC_USE_PTHREADS
|
#define ISPC_USE_PTHREADS
|
||||||
#elif defined(__APPLE__)
|
#elif defined(__APPLE__)
|
||||||
#define ISPC_IS_APPLE
|
#define ISPC_IS_APPLE
|
||||||
#define ISPC_USE_GCD
|
// pthreads is noticably more efficient than GCD on OSX
|
||||||
|
#define ISPC_USE_PTHREADS
|
||||||
|
//#define ISPC_USE_GCD
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define DBG(x)
|
#define DBG(x)
|
||||||
@@ -110,7 +112,7 @@ struct TaskInfo {
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// TaskGroupBase
|
// TaskGroupBase
|
||||||
|
|
||||||
#define LOG_TASK_QUEUE_CHUNK_SIZE 14
|
#define LOG_TASK_QUEUE_CHUNK_SIZE 12
|
||||||
#define MAX_TASK_QUEUE_CHUNKS 8
|
#define MAX_TASK_QUEUE_CHUNKS 8
|
||||||
#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
|
#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
|
||||||
|
|
||||||
@@ -157,6 +159,7 @@ private:
|
|||||||
int memBufferSize[NUM_MEM_BUFFERS];
|
int memBufferSize[NUM_MEM_BUFFERS];
|
||||||
char *memBuffers[NUM_MEM_BUFFERS];
|
char *memBuffers[NUM_MEM_BUFFERS];
|
||||||
char mem[256];
|
char mem[256];
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -8,10 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
|||||||
CXX=g++
|
CXX=g++
|
||||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64
|
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||||
|
|
||||||
OBJS=objs/volume.o objs/volume_serial.o $(TASK_OBJ) objs/volume_ispc.o \
|
|
||||||
objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o
|
|
||||||
|
|
||||||
default: volume
|
default: volume
|
||||||
|
|
||||||
@@ -23,8 +20,8 @@ dirs:
|
|||||||
clean:
|
clean:
|
||||||
/bin/rm -rf objs *~ volume
|
/bin/rm -rf objs *~ volume
|
||||||
|
|
||||||
volume: dirs $(OBJS)
|
volume: dirs objs/volume.o objs/volume_serial.o objs/volume_ispc.o $(TASK_OBJ)
|
||||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
$(CXX) $(CXXFLAGS) -o $@ objs/volume.o objs/volume_ispc.o objs/volume_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||||
|
|
||||||
objs/%.o: %.cpp
|
objs/%.o: %.cpp
|
||||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||||
@@ -34,5 +31,5 @@ objs/%.o: ../%.cpp
|
|||||||
|
|
||||||
objs/volume.o: objs/volume_ispc.h
|
objs/volume.o: objs/volume_ispc.h
|
||||||
|
|
||||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o: %.ispc
|
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||||
|
|||||||
@@ -41,6 +41,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
|
#include "../cpuid.h"
|
||||||
#include "volume_ispc.h"
|
#include "volume_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
@@ -69,6 +70,37 @@ writePPM(float *buf, int width, int height, const char *fn) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Make sure that the vector ISA used during compilation is supported by
|
||||||
|
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||||
|
// header file that we include above.
|
||||||
|
static void
|
||||||
|
ensureTargetISAIsSupported() {
|
||||||
|
#if defined(ISPC_TARGET_SSE2)
|
||||||
|
bool isaSupported = CPUSupportsSSE2();
|
||||||
|
const char *target = "SSE2";
|
||||||
|
#elif defined(ISPC_TARGET_SSE4)
|
||||||
|
bool isaSupported = CPUSupportsSSE4();
|
||||||
|
const char *target = "SSE4";
|
||||||
|
#elif defined(ISPC_TARGET_AVX)
|
||||||
|
bool isaSupported = CPUSupportsAVX();
|
||||||
|
const char *target = "AVX";
|
||||||
|
#else
|
||||||
|
#error "Unknown ISPC_TARGET_* value"
|
||||||
|
#endif
|
||||||
|
if (!isaSupported) {
|
||||||
|
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||||
|
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||||
|
fprintf(stderr, "***\n*** Please modify the "
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
"MSVC project file "
|
||||||
|
#else
|
||||||
|
"Makefile "
|
||||||
|
#endif
|
||||||
|
"to select another target (e.g. sse2)\n***\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Load image and viewing parameters from a camera data file.
|
/* Load image and viewing parameters from a camera data file.
|
||||||
FIXME: we should add support to be able to specify viewing parameters
|
FIXME: we should add support to be able to specify viewing parameters
|
||||||
in the program here directly. */
|
in the program here directly. */
|
||||||
@@ -140,6 +172,8 @@ int main(int argc, char *argv[]) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ensureTargetISAIsSupported();
|
||||||
|
|
||||||
//
|
//
|
||||||
// Load viewing data and the volume density data
|
// Load viewing data and the volume density data
|
||||||
//
|
//
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ struct Ray {
|
|||||||
static void
|
static void
|
||||||
generateRay(const uniform float raster2camera[4][4],
|
generateRay(const uniform float raster2camera[4][4],
|
||||||
const uniform float camera2world[4][4],
|
const uniform float camera2world[4][4],
|
||||||
float x, float y, Ray &ray) {
|
float x, float y, reference Ray ray) {
|
||||||
// transform raster coordinate (x, y, 0) to camera space
|
// transform raster coordinate (x, y, 0) to camera space
|
||||||
float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
|
float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
|
||||||
float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
|
float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
|
||||||
@@ -70,7 +70,7 @@ Inside(float3 p, float3 pMin, float3 pMax) {
|
|||||||
|
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
|
IntersectP(Ray ray, float3 pMin, float3 pMax, reference float hit0, reference float hit1) {
|
||||||
float t0 = -1e30, t1 = 1e30;
|
float t0 = -1e30, t1 = 1e30;
|
||||||
|
|
||||||
float3 tNear = (pMin - ray.origin) / ray.dir;
|
float3 tNear = (pMin - ray.origin) / ray.dir;
|
||||||
@@ -141,7 +141,7 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
|
|||||||
|
|
||||||
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||||
uniform float density[], uniform int nVoxels[3],
|
uniform float density[], uniform int nVoxels[3],
|
||||||
uniform bool &checkForSameVoxel) {
|
reference uniform bool checkForSameVoxel) {
|
||||||
if (!Inside(Pobj, pMin, pMax))
|
if (!Inside(Pobj, pMin, pMax))
|
||||||
return 0;
|
return 0;
|
||||||
// Compute voxel coordinates and offsets for _Pobj_
|
// Compute voxel coordinates and offsets for _Pobj_
|
||||||
@@ -155,8 +155,8 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
|||||||
// Trilinearly interpolate density values to compute local density
|
// Trilinearly interpolate density values to compute local density
|
||||||
float d00, d10, d01, d11;
|
float d00, d10, d01, d11;
|
||||||
uniform int uvx, uvy, uvz;
|
uniform int uvx, uvy, uvz;
|
||||||
if (checkForSameVoxel && reduce_equal(vx, &uvx) && reduce_equal(vy, &uvy) &&
|
if (checkForSameVoxel && reduce_equal(vx, uvx) && reduce_equal(vy, uvy) &&
|
||||||
reduce_equal(vz, &uvz)) {
|
reduce_equal(vz, uvz)) {
|
||||||
// If all of the program instances are inside the same voxel, then
|
// If all of the program instances are inside the same voxel, then
|
||||||
// we'll call the 'uniform' variant of the voxel density lookup
|
// we'll call the 'uniform' variant of the voxel density lookup
|
||||||
// function, thus doing a single load for each value rather than a
|
// function, thus doing a single load for each value rather than a
|
||||||
@@ -310,7 +310,11 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
|||||||
// by 4.
|
// by 4.
|
||||||
for (uniform int y = y0; y < y1; y += 4) {
|
for (uniform int y = y0; y < y1; y += 4) {
|
||||||
for (uniform int x = x0; x < x1; x += 4) {
|
for (uniform int x = x0; x < x1; x += 4) {
|
||||||
foreach (o = 0 ... 16) {
|
// For each such tile, process programCount pixels at a time,
|
||||||
|
// until we've done all 16 of them. Thus, we're also assuming
|
||||||
|
// that programCount <= 16 and that 16 is evenly dividible by
|
||||||
|
// programCount.
|
||||||
|
for (uniform int o = 0; o < 16; o += programCount) {
|
||||||
// These two arrays encode the mapping from [0,15] to
|
// These two arrays encode the mapping from [0,15] to
|
||||||
// offsets within the 4x4 pixel block so that we render
|
// offsets within the 4x4 pixel block so that we render
|
||||||
// each pixel inside the block
|
// each pixel inside the block
|
||||||
@@ -320,7 +324,8 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
|||||||
2, 2, 3, 3, 2, 2, 3, 3 };
|
2, 2, 3, 3, 2, 2, 3, 3 };
|
||||||
|
|
||||||
// Figure out the pixel to render for this program instance
|
// Figure out the pixel to render for this program instance
|
||||||
int xo = x + xoffsets[o], yo = y + yoffsets[o];
|
int xo = x + xoffsets[o + programIndex];
|
||||||
|
int yo = y + yoffsets[o + programIndex];
|
||||||
|
|
||||||
// Use viewing parameters to compute the corresponding ray
|
// Use viewing parameters to compute the corresponding ray
|
||||||
// for the pixel
|
// for the pixel
|
||||||
@@ -347,7 +352,7 @@ volume_task(uniform float density[], uniform int nVoxels[3],
|
|||||||
uniform int ybuckets = (height + (dy-1)) / dy;
|
uniform int ybuckets = (height + (dy-1)) / dy;
|
||||||
|
|
||||||
uniform int x0 = (taskIndex % xbuckets) * dx;
|
uniform int x0 = (taskIndex % xbuckets) * dx;
|
||||||
uniform int y0 = (taskIndex / xbuckets) * dy;
|
uniform int y0 = (taskIndex / ybuckets) * dy;
|
||||||
uniform int x1 = x0 + dx, y1 = y0 + dy;
|
uniform int x1 = x0 + dx, y1 = y0 + dy;
|
||||||
x1 = min(x1, width);
|
x1 = min(x1, width);
|
||||||
y1 = min(y1, height);
|
y1 = min(y1, height);
|
||||||
|
|||||||
@@ -64,19 +64,15 @@
|
|||||||
<PropertyGroup Label="UserMacros" />
|
<PropertyGroup Label="UserMacros" />
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||||
<LinkIncremental>true</LinkIncremental>
|
<LinkIncremental>true</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||||
<LinkIncremental>false</LinkIncremental>
|
<LinkIncremental>false</LinkIncremental>
|
||||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||||
<ClCompile>
|
<ClCompile>
|
||||||
@@ -85,7 +81,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -101,7 +96,6 @@
|
|||||||
<WarningLevel>Level3</WarningLevel>
|
<WarningLevel>Level3</WarningLevel>
|
||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
@@ -119,7 +113,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -138,7 +131,6 @@
|
|||||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
@@ -156,18 +148,18 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="volume.ispc">
|
<CustomBuild Include="volume.ispc">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||||
</Command>
|
</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
|||||||
@@ -36,6 +36,9 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
// Just enough of a float3 class to do what we need in this file.
|
// Just enough of a float3 class to do what we need in this file.
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
__declspec(align(16))
|
||||||
|
#endif
|
||||||
struct float3 {
|
struct float3 {
|
||||||
float3() { }
|
float3() { }
|
||||||
float3(float xx, float yy, float zz) { x = xx; y = yy; z = zz; }
|
float3(float xx, float yy, float zz) { x = xx; y = yy; z = zz; }
|
||||||
@@ -295,7 +298,7 @@ volume_serial(float density[], int nVoxels[3], const float raster2camera[4][4],
|
|||||||
for (int y = 0; y < height; ++y) {
|
for (int y = 0; y < height; ++y) {
|
||||||
for (int x = 0; x < width; ++x, ++offset) {
|
for (int x = 0; x < width; ++x, ++offset) {
|
||||||
Ray ray;
|
Ray ray;
|
||||||
generateRay(raster2camera, camera2world, (float)x, (float)y, ray);
|
generateRay(raster2camera, camera2world, x, y, ray);
|
||||||
image[offset] = raymarch(density, nVoxels, ray);
|
image[offset] = raymarch(density, nVoxels, ray);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
154
expr.h
154
expr.h
@@ -39,9 +39,10 @@
|
|||||||
#define ISPC_EXPR_H 1
|
#define ISPC_EXPR_H 1
|
||||||
|
|
||||||
#include "ispc.h"
|
#include "ispc.h"
|
||||||
#include "ast.h"
|
|
||||||
#include "type.h"
|
#include "type.h"
|
||||||
|
|
||||||
|
class FunctionSymbolExpr;
|
||||||
|
|
||||||
/** @brief Expr is the abstract base class that defines the interface that
|
/** @brief Expr is the abstract base class that defines the interface that
|
||||||
all expression types must implement.
|
all expression types must implement.
|
||||||
*/
|
*/
|
||||||
@@ -65,10 +66,6 @@ public:
|
|||||||
/** Returns the Type of the expression. */
|
/** Returns the Type of the expression. */
|
||||||
virtual const Type *GetType() const = 0;
|
virtual const Type *GetType() const = 0;
|
||||||
|
|
||||||
/** Returns the type of the value returned by GetLValueType(); this
|
|
||||||
should be a pointer type of some sort (uniform or varying). */
|
|
||||||
virtual const Type *GetLValueType() const;
|
|
||||||
|
|
||||||
/** For expressions that have values based on a symbol (e.g. regular
|
/** For expressions that have values based on a symbol (e.g. regular
|
||||||
symbol references, array indexing, etc.), this returns a pointer to
|
symbol references, array indexing, etc.), this returns a pointer to
|
||||||
that symbol. */
|
that symbol. */
|
||||||
@@ -93,6 +90,14 @@ public:
|
|||||||
|
|
||||||
/** Prints the expression to standard output (used for debugging). */
|
/** Prints the expression to standard output (used for debugging). */
|
||||||
virtual void Print() const = 0;
|
virtual void Print() const = 0;
|
||||||
|
|
||||||
|
/** This method tries to convert the expression to the given type. In
|
||||||
|
the event of failure, if the failureOk parameter is true, then no
|
||||||
|
error is issued. If failureOk is false, then an error is printed
|
||||||
|
that incorporates the given error message string. In either
|
||||||
|
failure case, NULL is returned. */
|
||||||
|
Expr *TypeConv(const Type *type, const char *errorMsgBase = NULL,
|
||||||
|
bool failureOk = false, bool issuePrecisionWarnings = true);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@@ -260,6 +265,10 @@ public:
|
|||||||
ExprList *args;
|
ExprList *args;
|
||||||
bool isLaunch;
|
bool isLaunch;
|
||||||
Expr *launchCountExpr;
|
Expr *launchCountExpr;
|
||||||
|
|
||||||
|
private:
|
||||||
|
void resolveFunctionOverloads(bool exactMatchOnly);
|
||||||
|
bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@@ -270,12 +279,11 @@ public:
|
|||||||
*/
|
*/
|
||||||
class IndexExpr : public Expr {
|
class IndexExpr : public Expr {
|
||||||
public:
|
public:
|
||||||
IndexExpr(Expr *baseExpr, Expr *index, SourcePos p);
|
IndexExpr(Expr *arrayOrVector, Expr *index, SourcePos p);
|
||||||
|
|
||||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||||
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||||
const Type *GetType() const;
|
const Type *GetType() const;
|
||||||
const Type *GetLValueType() const;
|
|
||||||
Symbol *GetBaseSymbol() const;
|
Symbol *GetBaseSymbol() const;
|
||||||
void Print() const;
|
void Print() const;
|
||||||
|
|
||||||
@@ -283,7 +291,7 @@ public:
|
|||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
int EstimateCost() const;
|
int EstimateCost() const;
|
||||||
|
|
||||||
Expr *baseExpr, *index;
|
Expr *arrayOrVector, *index;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@@ -293,35 +301,28 @@ public:
|
|||||||
*/
|
*/
|
||||||
class MemberExpr : public Expr {
|
class MemberExpr : public Expr {
|
||||||
public:
|
public:
|
||||||
static MemberExpr *create(Expr *expr, const char *identifier,
|
static MemberExpr* create(Expr *expr, const char *identifier,
|
||||||
SourcePos pos, SourcePos identifierPos,
|
SourcePos pos, SourcePos identifierPos);
|
||||||
bool derefLvalue);
|
|
||||||
|
MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
|
||||||
|
SourcePos identifierPos);
|
||||||
|
|
||||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||||
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||||
const Type *GetType() const;
|
const Type *GetType() const;
|
||||||
const Type *GetLValueType() const;
|
|
||||||
Symbol *GetBaseSymbol() const;
|
Symbol *GetBaseSymbol() const;
|
||||||
void Print() const;
|
void Print() const;
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
int EstimateCost() const;
|
int EstimateCost() const;
|
||||||
|
|
||||||
virtual int getElementNumber() const = 0;
|
virtual int getElementNumber() const;
|
||||||
virtual const Type *getElementType() const = 0;
|
|
||||||
std::string getCandidateNearMatches() const;
|
std::string getCandidateNearMatches() const;
|
||||||
|
|
||||||
Expr *expr;
|
Expr *expr;
|
||||||
std::string identifier;
|
std::string identifier;
|
||||||
const SourcePos identifierPos;
|
const SourcePos identifierPos;
|
||||||
|
|
||||||
protected:
|
|
||||||
MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
|
|
||||||
SourcePos identifierPos, bool derefLValue);
|
|
||||||
|
|
||||||
/** Indicates whether the expression should be dereferenced before the
|
|
||||||
member is found. (i.e. this is true if the MemberExpr was a '->'
|
|
||||||
operator, and is false if it was a '.' operator. */
|
|
||||||
bool dereferenceExpr;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@@ -493,8 +494,7 @@ private:
|
|||||||
probably-different type. */
|
probably-different type. */
|
||||||
class TypeCastExpr : public Expr {
|
class TypeCastExpr : public Expr {
|
||||||
public:
|
public:
|
||||||
TypeCastExpr(const Type *t, Expr *e, bool preserveUniformity,
|
TypeCastExpr(const Type *t, Expr *e, SourcePos p);
|
||||||
SourcePos p);
|
|
||||||
|
|
||||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||||
const Type *GetType() const;
|
const Type *GetType() const;
|
||||||
@@ -502,12 +502,9 @@ public:
|
|||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
int EstimateCost() const;
|
int EstimateCost() const;
|
||||||
Symbol *GetBaseSymbol() const;
|
|
||||||
llvm::Constant *GetConstant(const Type *type) const;
|
|
||||||
|
|
||||||
const Type *type;
|
const Type *type;
|
||||||
Expr *expr;
|
Expr *expr;
|
||||||
bool preserveUniformity;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@@ -519,7 +516,6 @@ public:
|
|||||||
|
|
||||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||||
const Type *GetType() const;
|
const Type *GetType() const;
|
||||||
const Type *GetLValueType() const;
|
|
||||||
Symbol *GetBaseSymbol() const;
|
Symbol *GetBaseSymbol() const;
|
||||||
void Print() const;
|
void Print() const;
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
@@ -539,7 +535,6 @@ public:
|
|||||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||||
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||||
const Type *GetType() const;
|
const Type *GetType() const;
|
||||||
const Type *GetLValueType() const;
|
|
||||||
Symbol *GetBaseSymbol() const;
|
Symbol *GetBaseSymbol() const;
|
||||||
void Print() const;
|
void Print() const;
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
@@ -550,44 +545,6 @@ public:
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/** Expression that represents taking the address of an expression. */
|
|
||||||
class AddressOfExpr : public Expr {
|
|
||||||
public:
|
|
||||||
AddressOfExpr(Expr *e, SourcePos p);
|
|
||||||
|
|
||||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
|
||||||
const Type *GetType() const;
|
|
||||||
Symbol *GetBaseSymbol() const;
|
|
||||||
void Print() const;
|
|
||||||
Expr *TypeCheck();
|
|
||||||
Expr *Optimize();
|
|
||||||
int EstimateCost() const;
|
|
||||||
|
|
||||||
Expr *expr;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/** Expression that returns the size of the given expression or type in
|
|
||||||
bytes. */
|
|
||||||
class SizeOfExpr : public Expr {
|
|
||||||
public:
|
|
||||||
SizeOfExpr(Expr *e, SourcePos p);
|
|
||||||
SizeOfExpr(const Type *t, SourcePos p);
|
|
||||||
|
|
||||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
|
||||||
const Type *GetType() const;
|
|
||||||
void Print() const;
|
|
||||||
Expr *TypeCheck();
|
|
||||||
Expr *Optimize();
|
|
||||||
int EstimateCost() const;
|
|
||||||
|
|
||||||
/* One of expr or type should be non-NULL (but not both of them). The
|
|
||||||
SizeOfExpr returns the size of whichever one of them isn't NULL. */
|
|
||||||
Expr *expr;
|
|
||||||
const Type *type;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/** @brief Expression representing a symbol reference in the program */
|
/** @brief Expression representing a symbol reference in the program */
|
||||||
class SymbolExpr : public Expr {
|
class SymbolExpr : public Expr {
|
||||||
public:
|
public:
|
||||||
@@ -596,7 +553,6 @@ public:
|
|||||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||||
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||||
const Type *GetType() const;
|
const Type *GetType() const;
|
||||||
const Type *GetLValueType() const;
|
|
||||||
Symbol *GetBaseSymbol() const;
|
Symbol *GetBaseSymbol() const;
|
||||||
Expr *TypeCheck();
|
Expr *TypeCheck();
|
||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
@@ -613,7 +569,7 @@ private:
|
|||||||
*/
|
*/
|
||||||
class FunctionSymbolExpr : public Expr {
|
class FunctionSymbolExpr : public Expr {
|
||||||
public:
|
public:
|
||||||
FunctionSymbolExpr(const char *name, const std::vector<Symbol *> &candFuncs,
|
FunctionSymbolExpr(const char *name, std::vector<Symbol *> *candidateFunctions,
|
||||||
SourcePos pos);
|
SourcePos pos);
|
||||||
|
|
||||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||||
@@ -623,25 +579,9 @@ public:
|
|||||||
Expr *Optimize();
|
Expr *Optimize();
|
||||||
void Print() const;
|
void Print() const;
|
||||||
int EstimateCost() const;
|
int EstimateCost() const;
|
||||||
llvm::Constant *GetConstant(const Type *type) const;
|
|
||||||
|
|
||||||
/** Given the types of the function arguments, in the presence of
|
|
||||||
function overloading, this method resolves which actual function
|
|
||||||
the arguments match best. If the argCouldBeNULL parameter is
|
|
||||||
non-NULL, each element indicates whether the corresponding argument
|
|
||||||
is the number zero, indicating that it could be a NULL pointer.
|
|
||||||
This parameter may be NULL (for cases where overload resolution is
|
|
||||||
being done just given type information without the parameter
|
|
||||||
argument expressions being available. It returns true on success.
|
|
||||||
*/
|
|
||||||
bool ResolveOverloads(const std::vector<const Type *> &argTypes,
|
|
||||||
const std::vector<bool> *argCouldBeNULL = NULL);
|
|
||||||
Symbol *GetMatchingFunction();
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool tryResolve(int (*matchFunc)(const Type *, const Type *),
|
friend class FunctionCallExpr;
|
||||||
const std::vector<const Type *> &argTypes,
|
|
||||||
const std::vector<bool> *argCouldBeNULL);
|
|
||||||
|
|
||||||
/** Name of the function that is being called. */
|
/** Name of the function that is being called. */
|
||||||
std::string name;
|
std::string name;
|
||||||
@@ -649,12 +589,11 @@ private:
|
|||||||
/** All of the functions with the name given in the function call;
|
/** All of the functions with the name given in the function call;
|
||||||
there may be more then one, in which case we need to resolve which
|
there may be more then one, in which case we need to resolve which
|
||||||
overload is the best match. */
|
overload is the best match. */
|
||||||
std::vector<Symbol *> candidateFunctions;
|
std::vector<Symbol *> *candidateFunctions;
|
||||||
|
|
||||||
/** The actual matching function found after overload resolution. */
|
/** The actual matching function found after overload resolution; this
|
||||||
|
value is set by FunctionCallExpr::resolveFunctionOverloads() */
|
||||||
Symbol *matchingFunc;
|
Symbol *matchingFunc;
|
||||||
|
|
||||||
bool triedToResolve;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@@ -672,37 +611,4 @@ public:
|
|||||||
int EstimateCost() const;
|
int EstimateCost() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/** @brief An expression that represents a NULL pointer. */
|
|
||||||
class NullPointerExpr : public Expr {
|
|
||||||
public:
|
|
||||||
NullPointerExpr(SourcePos p) : Expr(p) { }
|
|
||||||
|
|
||||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
|
||||||
const Type *GetType() const;
|
|
||||||
Expr *TypeCheck();
|
|
||||||
Expr *Optimize();
|
|
||||||
void Print() const;
|
|
||||||
int EstimateCost() const;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/** This function indicates whether it's legal to convert from fromType to
|
|
||||||
toType. If the optional errorMsgBase and source position parameters
|
|
||||||
are provided, then an error message is issued if the type conversion
|
|
||||||
isn't possible.
|
|
||||||
*/
|
|
||||||
bool CanConvertTypes(const Type *fromType, const Type *toType,
|
|
||||||
const char *errorMsgBase = NULL,
|
|
||||||
SourcePos pos = SourcePos());
|
|
||||||
|
|
||||||
/** This function attempts to convert the given expression to the given
|
|
||||||
type, returning a pointer to a new expression that is the result. If
|
|
||||||
the required type conversion is illegal, it returns NULL and prints an
|
|
||||||
error message using the provided string to indicate the context for
|
|
||||||
which type conversion was being applied (e.g. "function call
|
|
||||||
parameter").
|
|
||||||
*/
|
|
||||||
Expr *TypeConvertExpr(Expr *expr, const Type *toType, const char *errorMsgBase);
|
|
||||||
|
|
||||||
#endif // ISPC_EXPR_H
|
#endif // ISPC_EXPR_H
|
||||||
|
|||||||
@@ -14,10 +14,10 @@ export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
|
|||||||
varying int3 vv = array[a];
|
varying int3 vv = array[a];
|
||||||
++vv.y;
|
++vv.y;
|
||||||
array[a] = vv;
|
array[a] = vv;
|
||||||
|
//CO print("fin %\n", array[programIndex].y);
|
||||||
ret[programIndex] = array[programIndex].y;
|
ret[programIndex] = array[programIndex].y;
|
||||||
}
|
}
|
||||||
|
|
||||||
export void result(uniform float ret[]) {
|
export void result(uniform float ret[]) {
|
||||||
ret[programIndex] = 101+programIndex;
|
ret[programIndex] = 100+programIndex;
|
||||||
ret[0] = 100;
|
|
||||||
}
|
}
|
||||||
@@ -8,6 +8,9 @@ struct Foo {
|
|||||||
float y;
|
float y;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
extern void aa(reference Foo f);
|
||||||
|
extern void bb(reference Foo f[]);
|
||||||
|
|
||||||
typedef float<3> float3;
|
typedef float<3> float3;
|
||||||
|
|
||||||
void set(uniform float3 f[], int offset, float3 val) {
|
void set(uniform float3 f[], int offset, float3 val) {
|
||||||
414
func.cpp
414
func.cpp
@@ -1,414 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright (c) 2011, Intel Corporation
|
|
||||||
All rights reserved.
|
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are
|
|
||||||
met:
|
|
||||||
|
|
||||||
* Redistributions of source code must retain the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer.
|
|
||||||
|
|
||||||
* Redistributions in binary form must reproduce the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer in the
|
|
||||||
documentation and/or other materials provided with the distribution.
|
|
||||||
|
|
||||||
* Neither the name of Intel Corporation nor the names of its
|
|
||||||
contributors may be used to endorse or promote products derived from
|
|
||||||
this software without specific prior written permission.
|
|
||||||
|
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
|
||||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
||||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
||||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
||||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
||||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
||||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
||||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
||||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/** @file func.cpp
|
|
||||||
@brief
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "func.h"
|
|
||||||
#include "ctx.h"
|
|
||||||
#include "expr.h"
|
|
||||||
#include "llvmutil.h"
|
|
||||||
#include "module.h"
|
|
||||||
#include "type.h"
|
|
||||||
#include "stmt.h"
|
|
||||||
#include "sym.h"
|
|
||||||
#include "util.h"
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
#include <llvm/LLVMContext.h>
|
|
||||||
#include <llvm/Module.h>
|
|
||||||
#include <llvm/Type.h>
|
|
||||||
#include <llvm/DerivedTypes.h>
|
|
||||||
#include <llvm/Instructions.h>
|
|
||||||
#include <llvm/Intrinsics.h>
|
|
||||||
#include <llvm/PassManager.h>
|
|
||||||
#include <llvm/PassRegistry.h>
|
|
||||||
#include <llvm/Transforms/IPO.h>
|
|
||||||
#include <llvm/Support/FormattedStream.h>
|
|
||||||
#include <llvm/Support/FileUtilities.h>
|
|
||||||
#include <llvm/Target/TargetMachine.h>
|
|
||||||
#include <llvm/Target/TargetOptions.h>
|
|
||||||
#include <llvm/Target/TargetData.h>
|
|
||||||
#include <llvm/PassManager.h>
|
|
||||||
#include <llvm/Analysis/Verifier.h>
|
|
||||||
#include <llvm/Support/CFG.h>
|
|
||||||
#include <llvm/Support/ToolOutputFile.h>
|
|
||||||
#include <llvm/Assembly/PrintModulePass.h>
|
|
||||||
|
|
||||||
Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
|
|
||||||
sym = s;
|
|
||||||
args = a;
|
|
||||||
code = c;
|
|
||||||
|
|
||||||
maskSymbol = m->symbolTable->LookupVariable("__mask");
|
|
||||||
assert(maskSymbol != NULL);
|
|
||||||
|
|
||||||
if (code != NULL) {
|
|
||||||
if (g->debugPrint) {
|
|
||||||
fprintf(stderr, "Creating function \"%s\". Initial code:\n",
|
|
||||||
sym->name.c_str());
|
|
||||||
code->Print(0);
|
|
||||||
fprintf(stderr, "---------------------\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
code = code->TypeCheck();
|
|
||||||
|
|
||||||
if (code != NULL && g->debugPrint) {
|
|
||||||
fprintf(stderr, "After typechecking function \"%s\":\n",
|
|
||||||
sym->name.c_str());
|
|
||||||
code->Print(0);
|
|
||||||
fprintf(stderr, "---------------------\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (code != NULL) {
|
|
||||||
code = code->Optimize();
|
|
||||||
if (g->debugPrint) {
|
|
||||||
fprintf(stderr, "After optimizing function \"%s\":\n",
|
|
||||||
sym->name.c_str());
|
|
||||||
code->Print(0);
|
|
||||||
fprintf(stderr, "---------------------\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (g->debugPrint) {
|
|
||||||
printf("Add Function %s\n", sym->name.c_str());
|
|
||||||
code->Print(0);
|
|
||||||
printf("\n\n\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
|
||||||
assert(type != NULL);
|
|
||||||
|
|
||||||
for (unsigned int i = 0; i < args.size(); ++i)
|
|
||||||
if (dynamic_cast<const ReferenceType *>(args[i]->type) == NULL)
|
|
||||||
args[i]->parentFunction = this;
|
|
||||||
|
|
||||||
if (type->isTask) {
|
|
||||||
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
|
|
||||||
assert(threadIndexSym);
|
|
||||||
threadCountSym = m->symbolTable->LookupVariable("threadCount");
|
|
||||||
assert(threadCountSym);
|
|
||||||
taskIndexSym = m->symbolTable->LookupVariable("taskIndex");
|
|
||||||
assert(taskIndexSym);
|
|
||||||
taskCountSym = m->symbolTable->LookupVariable("taskCount");
|
|
||||||
assert(taskCountSym);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
threadIndexSym = threadCountSym = taskIndexSym = taskCountSym = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
const Type *
|
|
||||||
Function::GetReturnType() const {
|
|
||||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
|
||||||
assert(type != NULL);
|
|
||||||
return type->GetReturnType();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
const FunctionType *
|
|
||||||
Function::GetType() const {
|
|
||||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
|
||||||
assert(type != NULL);
|
|
||||||
return type;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/** Parameters for tasks are stored in a big structure; this utility
|
|
||||||
function emits code to copy those values out of the task structure into
|
|
||||||
local stack-allocated variables. (Which we expect that LLVM's
|
|
||||||
'mem2reg' pass will in turn promote to SSA registers..
|
|
||||||
*/
|
|
||||||
static void
|
|
||||||
lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol *> &args,
|
|
||||||
FunctionEmitContext *ctx) {
|
|
||||||
// We expect the argument structure to come in as a poitner to a
|
|
||||||
// structure. Confirm and figure out its type here.
|
|
||||||
const llvm::Type *structArgType = structArgPtr->getType();
|
|
||||||
assert(llvm::isa<llvm::PointerType>(structArgType));
|
|
||||||
const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(structArgType);
|
|
||||||
assert(llvm::isa<llvm::StructType>(pt->getElementType()));
|
|
||||||
const llvm::StructType *argStructType =
|
|
||||||
llvm::dyn_cast<const llvm::StructType>(pt->getElementType());
|
|
||||||
|
|
||||||
// Get the type of the argument we're copying in and its Symbol pointer
|
|
||||||
LLVM_TYPE_CONST llvm::Type *argType = argStructType->getElementType(i);
|
|
||||||
Symbol *sym = args[i];
|
|
||||||
|
|
||||||
// allocate space to copy the parameter in to
|
|
||||||
sym->storagePtr = ctx->AllocaInst(argType, sym->name.c_str());
|
|
||||||
|
|
||||||
// get a pointer to the value in the struct
|
|
||||||
llvm::Value *ptr = ctx->AddElementOffset(structArgPtr, i, NULL, sym->name.c_str());
|
|
||||||
|
|
||||||
// and copy the value from the struct and into the local alloca'ed
|
|
||||||
// memory
|
|
||||||
llvm::Value *ptrval = ctx->LoadInst(ptr, sym->name.c_str());
|
|
||||||
ctx->StoreInst(ptrval, sym->storagePtr);
|
|
||||||
ctx->EmitFunctionParameterDebugInfo(sym);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/** Given the statements implementing a function, emit the code that
|
|
||||||
implements the function. Most of the work do be done here just
|
|
||||||
involves wiring up the function parameter values to be available in the
|
|
||||||
function body code.
|
|
||||||
*/
|
|
||||||
void
|
|
||||||
Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
|
||||||
SourcePos firstStmtPos) {
|
|
||||||
llvm::Value *maskPtr = ctx->AllocaInst(LLVMTypes::MaskType, "mask_memory");
|
|
||||||
ctx->StoreInst(LLVMMaskAllOn, maskPtr);
|
|
||||||
maskSymbol->storagePtr = maskPtr;
|
|
||||||
ctx->SetMaskPointer(maskPtr);
|
|
||||||
|
|
||||||
// add debugging info for __mask, programIndex, ...
|
|
||||||
maskSymbol->pos = firstStmtPos;
|
|
||||||
ctx->EmitVariableDebugInfo(maskSymbol);
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
llvm::BasicBlock *entryBBlock = ctx->GetCurrentBasicBlock();
|
|
||||||
#endif
|
|
||||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
|
||||||
assert(type != NULL);
|
|
||||||
if (type->isTask == true) {
|
|
||||||
// For tasks, we there should always be three parmeters: the
|
|
||||||
// pointer to the structure that holds all of the arguments, the
|
|
||||||
// thread index, and the thread count variables.
|
|
||||||
llvm::Function::arg_iterator argIter = function->arg_begin();
|
|
||||||
llvm::Value *structParamPtr = argIter++;
|
|
||||||
llvm::Value *threadIndex = argIter++;
|
|
||||||
llvm::Value *threadCount = argIter++;
|
|
||||||
llvm::Value *taskIndex = argIter++;
|
|
||||||
llvm::Value *taskCount = argIter++;
|
|
||||||
|
|
||||||
// Copy the function parameter values from the structure into local
|
|
||||||
// storage
|
|
||||||
for (unsigned int i = 0; i < args.size(); ++i)
|
|
||||||
lCopyInTaskParameter(i, structParamPtr, args, ctx);
|
|
||||||
|
|
||||||
// Copy in the mask as well.
|
|
||||||
int nArgs = (int)args.size();
|
|
||||||
// The mask is the last parameter in the argument structure
|
|
||||||
llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL,
|
|
||||||
"task_struct_mask");
|
|
||||||
llvm::Value *ptrval = ctx->LoadInst(ptr, "mask");
|
|
||||||
ctx->SetFunctionMask(ptrval);
|
|
||||||
|
|
||||||
// Copy threadIndex and threadCount into stack-allocated storage so
|
|
||||||
// that their symbols point to something reasonable.
|
|
||||||
threadIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadIndex");
|
|
||||||
ctx->StoreInst(threadIndex, threadIndexSym->storagePtr);
|
|
||||||
|
|
||||||
threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount");
|
|
||||||
ctx->StoreInst(threadCount, threadCountSym->storagePtr);
|
|
||||||
|
|
||||||
// Copy taskIndex and taskCount into stack-allocated storage so
|
|
||||||
// that their symbols point to something reasonable.
|
|
||||||
taskIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex");
|
|
||||||
ctx->StoreInst(taskIndex, taskIndexSym->storagePtr);
|
|
||||||
|
|
||||||
taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount");
|
|
||||||
ctx->StoreInst(taskCount, taskCountSym->storagePtr);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// Regular, non-task function
|
|
||||||
llvm::Function::arg_iterator argIter = function->arg_begin();
|
|
||||||
for (unsigned int i = 0; i < args.size(); ++i, ++argIter) {
|
|
||||||
Symbol *sym = args[i];
|
|
||||||
argIter->setName(sym->name.c_str());
|
|
||||||
|
|
||||||
// Allocate stack storage for the parameter and emit code
|
|
||||||
// to store the its value there.
|
|
||||||
sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str());
|
|
||||||
ctx->StoreInst(argIter, sym->storagePtr);
|
|
||||||
ctx->EmitFunctionParameterDebugInfo(sym);
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the number of actual function arguments is equal to the
|
|
||||||
// number of declared arguments in decl->functionParams, then we
|
|
||||||
// don't have a mask parameter, so set it to be all on. This
|
|
||||||
// happens for exmaple with 'export'ed functions that the app
|
|
||||||
// calls.
|
|
||||||
if (argIter == function->arg_end())
|
|
||||||
ctx->SetFunctionMask(LLVMMaskAllOn);
|
|
||||||
else {
|
|
||||||
// Otherwise use the mask to set the entry mask value
|
|
||||||
argIter->setName("__mask");
|
|
||||||
assert(argIter->getType() == LLVMTypes::MaskType);
|
|
||||||
ctx->SetFunctionMask(argIter);
|
|
||||||
assert(++argIter == function->arg_end());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Finally, we can generate code for the function
|
|
||||||
if (code != NULL) {
|
|
||||||
int costEstimate = code->EstimateCost();
|
|
||||||
bool checkMask = (type->isTask == true) ||
|
|
||||||
((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
|
|
||||||
costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
|
|
||||||
Debug(code->pos, "Estimated cost for function \"%s\" = %d\n",
|
|
||||||
sym->name.c_str(), costEstimate);
|
|
||||||
// If the body of the function is non-trivial, then we wrap the
|
|
||||||
// entire thing around a varying "cif (true)" test in order to reap
|
|
||||||
// the side-effect benefit of checking to see if the execution mask
|
|
||||||
// is all on and thence having a specialized code path for that
|
|
||||||
// case. If this is a simple function, then this isn't worth the
|
|
||||||
// code bloat / overhead.
|
|
||||||
if (checkMask) {
|
|
||||||
bool allTrue[ISPC_MAX_NVEC];
|
|
||||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
|
||||||
allTrue[i] = true;
|
|
||||||
Expr *trueExpr = new ConstExpr(AtomicType::VaryingBool, allTrue,
|
|
||||||
code->pos);
|
|
||||||
code = new IfStmt(trueExpr, code, NULL, true, code->pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
ctx->SetDebugPos(code->pos);
|
|
||||||
ctx->AddInstrumentationPoint("function entry");
|
|
||||||
code->EmitCode(ctx);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ctx->GetCurrentBasicBlock()) {
|
|
||||||
// FIXME: We'd like to issue a warning if we've reached the end of
|
|
||||||
// the function without a return statement (for non-void
|
|
||||||
// functions). But the test below isn't right, since we can have
|
|
||||||
// (with 'x' a varying test) "if (x) return a; else return b;", in
|
|
||||||
// which case we have a valid basic block but its unreachable so ok
|
|
||||||
// to not have return statement.
|
|
||||||
#if 0
|
|
||||||
// If the bblock has no predecessors, then it doesn't matter if it
|
|
||||||
// doesn't have a return; it'll never be reached. If it does,
|
|
||||||
// issue a warning. Also need to warn if it's the entry block for
|
|
||||||
// the function (in which case it will not have predeccesors but is
|
|
||||||
// still reachable.)
|
|
||||||
if (type->GetReturnType() != AtomicType::Void &&
|
|
||||||
(pred_begin(ec.bblock) != pred_end(ec.bblock) || (ec.bblock == entryBBlock)))
|
|
||||||
Warning(sym->pos, "Missing return statement in function returning \"%s\".",
|
|
||||||
type->rType->GetString().c_str());
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// FIXME: would like to set the context's current position to
|
|
||||||
// e.g. the end of the function code
|
|
||||||
|
|
||||||
// if bblock is non-NULL, it hasn't been terminated by e.g. a
|
|
||||||
// return instruction. Need to add a return instruction.
|
|
||||||
ctx->ReturnInst();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void
|
|
||||||
Function::GenerateIR() {
|
|
||||||
if (sym == NULL)
|
|
||||||
// May be NULL due to error earlier in compilation
|
|
||||||
return;
|
|
||||||
|
|
||||||
llvm::Function *function = sym->function;
|
|
||||||
assert(function != NULL);
|
|
||||||
|
|
||||||
// But if that function has a definition, we don't want to redefine it.
|
|
||||||
if (function->empty() == false) {
|
|
||||||
Error(sym->pos, "Ignoring redefinition of function \"%s\".",
|
|
||||||
sym->name.c_str());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Figure out a reasonable source file position for the start of the
|
|
||||||
// function body. If possible, get the position of the first actual
|
|
||||||
// non-StmtList statment...
|
|
||||||
SourcePos firstStmtPos = sym->pos;
|
|
||||||
if (code) {
|
|
||||||
StmtList *sl = dynamic_cast<StmtList *>(code);
|
|
||||||
if (sl && sl->GetStatements().size() > 0 &&
|
|
||||||
sl->GetStatements()[0] != NULL)
|
|
||||||
firstStmtPos = sl->GetStatements()[0]->pos;
|
|
||||||
else
|
|
||||||
firstStmtPos = code->pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
// And we can now go ahead and emit the code
|
|
||||||
{
|
|
||||||
FunctionEmitContext ec(this, sym, function, firstStmtPos);
|
|
||||||
emitCode(&ec, function, firstStmtPos);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (m->errorCount == 0) {
|
|
||||||
if (llvm::verifyFunction(*function, llvm::ReturnStatusAction) == true) {
|
|
||||||
if (g->debugPrint)
|
|
||||||
function->dump();
|
|
||||||
FATAL("Function verificication failed");
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the function is 'export'-qualified, emit a second version of
|
|
||||||
// it without a mask parameter and without name mangling so that
|
|
||||||
// the application can call it
|
|
||||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
|
||||||
assert(type != NULL);
|
|
||||||
if (type->isExported) {
|
|
||||||
if (!type->isTask) {
|
|
||||||
LLVM_TYPE_CONST llvm::FunctionType *ftype =
|
|
||||||
type->LLVMFunctionType(g->ctx);
|
|
||||||
llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
|
|
||||||
std::string functionName = sym->name;
|
|
||||||
if (g->mangleFunctionsWithTarget)
|
|
||||||
functionName += std::string("_") + g->target.GetISAString();
|
|
||||||
llvm::Function *appFunction =
|
|
||||||
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
|
|
||||||
appFunction->setDoesNotThrow(true);
|
|
||||||
|
|
||||||
if (appFunction->getName() != functionName) {
|
|
||||||
// this was a redefinition for which we already emitted an
|
|
||||||
// error, so don't worry about this one...
|
|
||||||
appFunction->eraseFromParent();
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// And emit the code again
|
|
||||||
FunctionEmitContext ec(this, sym, appFunction, firstStmtPos);
|
|
||||||
emitCode(&ec, appFunction, firstStmtPos);
|
|
||||||
if (m->errorCount == 0) {
|
|
||||||
sym->exportedFunction = appFunction;
|
|
||||||
if (llvm::verifyFunction(*appFunction,
|
|
||||||
llvm::ReturnStatusAction) == true) {
|
|
||||||
if (g->debugPrint)
|
|
||||||
appFunction->dump();
|
|
||||||
FATAL("Function verificication failed");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
66
func.h
66
func.h
@@ -1,66 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright (c) 2011, Intel Corporation
|
|
||||||
All rights reserved.
|
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are
|
|
||||||
met:
|
|
||||||
|
|
||||||
* Redistributions of source code must retain the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer.
|
|
||||||
|
|
||||||
* Redistributions in binary form must reproduce the above copyright
|
|
||||||
notice, this list of conditions and the following disclaimer in the
|
|
||||||
documentation and/or other materials provided with the distribution.
|
|
||||||
|
|
||||||
* Neither the name of Intel Corporation nor the names of its
|
|
||||||
contributors may be used to endorse or promote products derived from
|
|
||||||
this software without specific prior written permission.
|
|
||||||
|
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
|
||||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
||||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
||||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
||||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
||||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
||||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
||||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
||||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/** @file func.h
|
|
||||||
@brief Representation of a function in a source file.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef ISPC_FUNC_H
|
|
||||||
#define ISPC_FUNC_H 1
|
|
||||||
|
|
||||||
#include "ispc.h"
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
class Function {
|
|
||||||
public:
|
|
||||||
Function(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code);
|
|
||||||
|
|
||||||
const Type *GetReturnType() const;
|
|
||||||
const FunctionType *GetType() const;
|
|
||||||
|
|
||||||
/** Generate LLVM IR for the function into the current module. */
|
|
||||||
void GenerateIR();
|
|
||||||
|
|
||||||
private:
|
|
||||||
void emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
|
||||||
SourcePos firstStmtPos);
|
|
||||||
|
|
||||||
Symbol *sym;
|
|
||||||
std::vector<Symbol *> args;
|
|
||||||
Stmt *code;
|
|
||||||
Symbol *maskSymbol;
|
|
||||||
Symbol *threadIndexSym, *threadCountSym;
|
|
||||||
Symbol *taskIndexSym, *taskCountSym;
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif // ISPC_FUNC_H
|
|
||||||
119
ispc.cpp
119
ispc.cpp
@@ -38,7 +38,6 @@
|
|||||||
#include "ispc.h"
|
#include "ispc.h"
|
||||||
#include "module.h"
|
#include "module.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "llvmutil.h"
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#ifdef ISPC_IS_WINDOWS
|
#ifdef ISPC_IS_WINDOWS
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
@@ -53,7 +52,7 @@
|
|||||||
#include <llvm/Target/TargetMachine.h>
|
#include <llvm/Target/TargetMachine.h>
|
||||||
#include <llvm/Target/TargetOptions.h>
|
#include <llvm/Target/TargetOptions.h>
|
||||||
#include <llvm/Target/TargetData.h>
|
#include <llvm/Target/TargetData.h>
|
||||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
#include <llvm/Support/TargetRegistry.h>
|
#include <llvm/Support/TargetRegistry.h>
|
||||||
#include <llvm/Support/TargetSelect.h>
|
#include <llvm/Support/TargetSelect.h>
|
||||||
#else
|
#else
|
||||||
@@ -75,7 +74,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
|||||||
if (cpu == NULL) {
|
if (cpu == NULL) {
|
||||||
std::string hostCPU = llvm::sys::getHostCPUName();
|
std::string hostCPU = llvm::sys::getHostCPUName();
|
||||||
if (hostCPU.size() > 0)
|
if (hostCPU.size() > 0)
|
||||||
cpu = strdup(hostCPU.c_str());
|
cpu = hostCPU.c_str();
|
||||||
else {
|
else {
|
||||||
fprintf(stderr, "Warning: unable to determine host CPU!\n");
|
fprintf(stderr, "Warning: unable to determine host CPU!\n");
|
||||||
cpu = "generic";
|
cpu = "generic";
|
||||||
@@ -86,7 +85,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
|||||||
if (isa == NULL) {
|
if (isa == NULL) {
|
||||||
if (!strcasecmp(cpu, "atom"))
|
if (!strcasecmp(cpu, "atom"))
|
||||||
isa = "sse2";
|
isa = "sse2";
|
||||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||||
else if (!strcasecmp(cpu, "sandybridge") ||
|
else if (!strcasecmp(cpu, "sandybridge") ||
|
||||||
!strcasecmp(cpu, "corei7-avx"))
|
!strcasecmp(cpu, "corei7-avx"))
|
||||||
isa = "avx";
|
isa = "avx";
|
||||||
@@ -130,25 +129,19 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
|||||||
t->vectorWidth = 4;
|
t->vectorWidth = 4;
|
||||||
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
|
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
|
||||||
}
|
}
|
||||||
else if (!strcasecmp(isa, "sse2-x2")) {
|
|
||||||
t->isa = Target::SSE2;
|
|
||||||
t->nativeVectorWidth = 4;
|
|
||||||
t->vectorWidth = 8;
|
|
||||||
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
|
|
||||||
}
|
|
||||||
else if (!strcasecmp(isa, "sse4")) {
|
else if (!strcasecmp(isa, "sse4")) {
|
||||||
t->isa = Target::SSE4;
|
t->isa = Target::SSE4;
|
||||||
t->nativeVectorWidth = 4;
|
t->nativeVectorWidth = 4;
|
||||||
t->vectorWidth = 4;
|
t->vectorWidth = 4;
|
||||||
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
||||||
}
|
}
|
||||||
else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) {
|
else if (!strcasecmp(isa, "sse4x2")) {
|
||||||
t->isa = Target::SSE4;
|
t->isa = Target::SSE4;
|
||||||
t->nativeVectorWidth = 4;
|
t->nativeVectorWidth = 4;
|
||||||
t->vectorWidth = 8;
|
t->vectorWidth = 8;
|
||||||
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
||||||
}
|
}
|
||||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
else if (!strcasecmp(isa, "avx")) {
|
else if (!strcasecmp(isa, "avx")) {
|
||||||
t->isa = Target::AVX;
|
t->isa = Target::AVX;
|
||||||
t->nativeVectorWidth = 8;
|
t->nativeVectorWidth = 8;
|
||||||
@@ -171,7 +164,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
|||||||
if (!error) {
|
if (!error) {
|
||||||
llvm::TargetMachine *targetMachine = t->GetTargetMachine();
|
llvm::TargetMachine *targetMachine = t->GetTargetMachine();
|
||||||
const llvm::TargetData *targetData = targetMachine->getTargetData();
|
const llvm::TargetData *targetData = targetMachine->getTargetData();
|
||||||
t->is32Bit = (targetData->getPointerSize() == 4);
|
t->is32bit = (targetData->getPointerSize() == 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
return !error;
|
return !error;
|
||||||
@@ -181,7 +174,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
|||||||
const char *
|
const char *
|
||||||
Target::SupportedTargetCPUs() {
|
Target::SupportedTargetCPUs() {
|
||||||
return "atom, barcelona, core2, corei7, "
|
return "atom, barcelona, core2, corei7, "
|
||||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||||
"corei7-avx, "
|
"corei7-avx, "
|
||||||
#endif
|
#endif
|
||||||
"istanbul, nocona, penryn, "
|
"istanbul, nocona, penryn, "
|
||||||
@@ -200,8 +193,8 @@ Target::SupportedTargetArchs() {
|
|||||||
|
|
||||||
const char *
|
const char *
|
||||||
Target::SupportedTargetISAs() {
|
Target::SupportedTargetISAs() {
|
||||||
return "sse2, sse2-x2, sse4, sse4-x2"
|
return "sse2, sse4, sse4x2"
|
||||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||||
", avx, avx-x2"
|
", avx, avx-x2"
|
||||||
#endif
|
#endif
|
||||||
;
|
;
|
||||||
@@ -212,11 +205,7 @@ std::string
|
|||||||
Target::GetTripleString() const {
|
Target::GetTripleString() const {
|
||||||
llvm::Triple triple;
|
llvm::Triple triple;
|
||||||
// Start with the host triple as the default
|
// Start with the host triple as the default
|
||||||
#if defined(LLVM_3_1) || defined(LLVM_3_1svn)
|
|
||||||
triple.setTriple(llvm::sys::getDefaultTargetTriple());
|
|
||||||
#else
|
|
||||||
triple.setTriple(llvm::sys::getHostTriple());
|
triple.setTriple(llvm::sys::getHostTriple());
|
||||||
#endif
|
|
||||||
|
|
||||||
// And override the arch in the host triple based on what the user
|
// And override the arch in the host triple based on what the user
|
||||||
// specified. Here we need to deal with the fact that LLVM uses one
|
// specified. Here we need to deal with the fact that LLVM uses one
|
||||||
@@ -241,7 +230,7 @@ Target::GetTargetMachine() const {
|
|||||||
|
|
||||||
llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ :
|
llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ :
|
||||||
llvm::Reloc::Default;
|
llvm::Reloc::Default;
|
||||||
#if defined(LLVM_3_0svn) || defined(LLVM_3_1svn) || defined(LLVM_3_0)
|
#if defined(LLVM_3_0svn) || defined(LLVM_3_0)
|
||||||
std::string featuresString = attributes;
|
std::string featuresString = attributes;
|
||||||
llvm::TargetMachine *targetMachine =
|
llvm::TargetMachine *targetMachine =
|
||||||
target->createTargetMachine(triple, cpu, featuresString, relocModel);
|
target->createTargetMachine(triple, cpu, featuresString, relocModel);
|
||||||
@@ -263,53 +252,6 @@ Target::GetTargetMachine() const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
const char *
|
|
||||||
Target::GetISAString() const {
|
|
||||||
switch (isa) {
|
|
||||||
case Target::SSE2:
|
|
||||||
return "sse2";
|
|
||||||
case Target::SSE4:
|
|
||||||
return "sse4";
|
|
||||||
case Target::AVX:
|
|
||||||
return "avx";
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
FATAL("Unhandled target in GetISAString()");
|
|
||||||
}
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
llvm::Value *
|
|
||||||
Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type) {
|
|
||||||
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
|
|
||||||
assert(td != NULL);
|
|
||||||
uint64_t byteSize = td->getTypeSizeInBits(type) / 8;
|
|
||||||
if (is32Bit || g->opt.force32BitAddressing)
|
|
||||||
return LLVMInt32(byteSize);
|
|
||||||
else
|
|
||||||
return LLVMInt64(byteSize);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
llvm::Value *
|
|
||||||
Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element) {
|
|
||||||
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
|
|
||||||
assert(td != NULL);
|
|
||||||
LLVM_TYPE_CONST llvm::StructType *structType =
|
|
||||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
|
|
||||||
assert(structType != NULL);
|
|
||||||
const llvm::StructLayout *sl = td->getStructLayout(structType);
|
|
||||||
assert(sl != NULL);
|
|
||||||
|
|
||||||
uint64_t offset = sl->getElementOffset(element);
|
|
||||||
if (is32Bit || g->opt.force32BitAddressing)
|
|
||||||
return LLVMInt32(offset);
|
|
||||||
else
|
|
||||||
return LLVMInt64(offset);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Opt
|
// Opt
|
||||||
|
|
||||||
@@ -317,10 +259,7 @@ Opt::Opt() {
|
|||||||
level = 1;
|
level = 1;
|
||||||
fastMath = false;
|
fastMath = false;
|
||||||
fastMaskedVload = false;
|
fastMaskedVload = false;
|
||||||
force32BitAddressing = true;
|
|
||||||
unrollLoops = true;
|
unrollLoops = true;
|
||||||
disableAsserts = false;
|
|
||||||
disableHandlePseudoMemoryOps = false;
|
|
||||||
disableBlendedMaskedStores = false;
|
disableBlendedMaskedStores = false;
|
||||||
disableCoherentControlFlow = false;
|
disableCoherentControlFlow = false;
|
||||||
disableUniformControlFlow = false;
|
disableUniformControlFlow = false;
|
||||||
@@ -341,37 +280,35 @@ Globals::Globals() {
|
|||||||
runCPP = true;
|
runCPP = true;
|
||||||
debugPrint = false;
|
debugPrint = false;
|
||||||
disableWarnings = false;
|
disableWarnings = false;
|
||||||
warningsAsErrors = false;
|
|
||||||
disableLineWrap = false;
|
|
||||||
emitPerfWarnings = true;
|
emitPerfWarnings = true;
|
||||||
emitInstrumentation = false;
|
emitInstrumentation = false;
|
||||||
generateDebuggingSymbols = false;
|
generateDebuggingSymbols = false;
|
||||||
mangleFunctionsWithTarget = false;
|
|
||||||
|
|
||||||
ctx = new llvm::LLVMContext;
|
ctx = new llvm::LLVMContext;
|
||||||
|
|
||||||
#ifdef ISPC_IS_WINDOWS
|
#ifdef ISPC_IS_WINDOWS
|
||||||
_getcwd(currentDirectory, sizeof(currentDirectory));
|
_getcwd(currentDirectory, sizeof(currentDirectory));
|
||||||
#else
|
#else
|
||||||
if (getcwd(currentDirectory, sizeof(currentDirectory)) == NULL)
|
getcwd(currentDirectory, sizeof(currentDirectory));
|
||||||
FATAL("Current directory path too long!");
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// ASTNode
|
||||||
|
|
||||||
|
ASTNode::~ASTNode() {
|
||||||
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// SourcePos
|
// SourcePos
|
||||||
|
|
||||||
SourcePos::SourcePos(const char *n, int fl, int fc, int ll, int lc) {
|
SourcePos::SourcePos(const char *n, int l, int c) {
|
||||||
name = n ? n : m->module->getModuleIdentifier().c_str();
|
name = n ? n : m->module->getModuleIdentifier().c_str();
|
||||||
first_line = fl;
|
first_line = last_line = l;
|
||||||
first_column = fc;
|
first_column = last_column = c;
|
||||||
last_line = ll != 0 ? ll : fl;
|
|
||||||
last_column = lc != 0 ? lc : fc;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llvm::DIFile SourcePos::GetDIFile() const {
|
||||||
llvm::DIFile
|
|
||||||
SourcePos::GetDIFile() const {
|
|
||||||
std::string directory, filename;
|
std::string directory, filename;
|
||||||
GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
|
GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
|
||||||
return m->diBuilder->createFile(filename, directory);
|
return m->diBuilder->createFile(filename, directory);
|
||||||
@@ -394,17 +331,3 @@ SourcePos::operator==(const SourcePos &p2) const {
|
|||||||
last_column == p2.last_column);
|
last_column == p2.last_column);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
SourcePos
|
|
||||||
Union(const SourcePos &p1, const SourcePos &p2) {
|
|
||||||
if (strcmp(p1.name, p2.name) != 0)
|
|
||||||
return p1;
|
|
||||||
|
|
||||||
SourcePos ret;
|
|
||||||
ret.name = p1.name;
|
|
||||||
ret.first_line = std::min(p1.first_line, p2.first_line);
|
|
||||||
ret.first_column = std::min(p1.first_column, p2.first_column);
|
|
||||||
ret.last_line = std::max(p1.last_line, p2.last_line);
|
|
||||||
ret.last_column = std::max(p1.last_column, p2.last_column);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|||||||
103
ispc.h
103
ispc.h
@@ -38,10 +38,6 @@
|
|||||||
#ifndef ISPC_H
|
#ifndef ISPC_H
|
||||||
#define ISPC_H
|
#define ISPC_H
|
||||||
|
|
||||||
#if !defined(LLVM_2_9) && !defined(LLVM_3_0) && !defined(LLVM_3_0svn) && !defined(LLVM_3_1svn)
|
|
||||||
#error "Only LLVM 2.9, 3.0, and the 3.1 development branch are supported"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(_WIN32) || defined(_WIN64)
|
#if defined(_WIN32) || defined(_WIN64)
|
||||||
#define ISPC_IS_WINDOWS
|
#define ISPC_IS_WINDOWS
|
||||||
#elif defined(__linux__)
|
#elif defined(__linux__)
|
||||||
@@ -80,7 +76,7 @@ namespace llvm {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// llvm::Type *s are no longer const in llvm 3.0
|
// llvm::Type *s are no longer const in llvm 3.0
|
||||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
#define LLVM_TYPE_CONST
|
#define LLVM_TYPE_CONST
|
||||||
#else
|
#else
|
||||||
#define LLVM_TYPE_CONST const
|
#define LLVM_TYPE_CONST const
|
||||||
@@ -88,17 +84,19 @@ namespace llvm {
|
|||||||
|
|
||||||
class ArrayType;
|
class ArrayType;
|
||||||
class AtomicType;
|
class AtomicType;
|
||||||
|
class DeclSpecs;
|
||||||
|
class Declaration;
|
||||||
|
class Declarator;
|
||||||
class FunctionEmitContext;
|
class FunctionEmitContext;
|
||||||
class Expr;
|
class Expr;
|
||||||
class ExprList;
|
class ExprList;
|
||||||
class Function;
|
|
||||||
class FunctionType;
|
class FunctionType;
|
||||||
|
class GatherBuffer;
|
||||||
class Module;
|
class Module;
|
||||||
class Stmt;
|
class Stmt;
|
||||||
class Symbol;
|
class Symbol;
|
||||||
class SymbolTable;
|
class SymbolTable;
|
||||||
class Type;
|
class Type;
|
||||||
struct VariableDeclaration;
|
|
||||||
|
|
||||||
/** @brief Representation of a range of positions in a source file.
|
/** @brief Representation of a range of positions in a source file.
|
||||||
|
|
||||||
@@ -108,8 +106,7 @@ struct VariableDeclaration;
|
|||||||
lexing code). Both lines and columns are counted starting from one.
|
lexing code). Both lines and columns are counted starting from one.
|
||||||
*/
|
*/
|
||||||
struct SourcePos {
|
struct SourcePos {
|
||||||
SourcePos(const char *n = NULL, int fl = 0, int fc = 0,
|
SourcePos(const char *n = NULL, int l = 0, int c = 0);
|
||||||
int ll = 0, int lc = 0);
|
|
||||||
|
|
||||||
const char *name;
|
const char *name;
|
||||||
int first_line;
|
int first_line;
|
||||||
@@ -126,10 +123,37 @@ struct SourcePos {
|
|||||||
bool operator==(const SourcePos &p2) const;
|
bool operator==(const SourcePos &p2) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Returns a SourcePos that encompasses the extent of both of the given
|
|
||||||
extents. */
|
|
||||||
SourcePos Union(const SourcePos &p1, const SourcePos &p2);
|
|
||||||
|
|
||||||
|
/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
|
||||||
|
|
||||||
|
This class defines a basic interface that all abstract syntax tree
|
||||||
|
(AST) nodes must implement. The base classes for both expressions
|
||||||
|
(Expr) and statements (Stmt) inherit from this class.
|
||||||
|
*/
|
||||||
|
class ASTNode {
|
||||||
|
public:
|
||||||
|
ASTNode(SourcePos p) : pos(p) { }
|
||||||
|
virtual ~ASTNode();
|
||||||
|
|
||||||
|
/** The Optimize() method should perform any appropriate early-stage
|
||||||
|
optimizations on the node (e.g. constant folding). The caller
|
||||||
|
should use the returned ASTNode * in place of the original node.
|
||||||
|
This method may return NULL if an error is encountered during
|
||||||
|
optimization. */
|
||||||
|
virtual ASTNode *Optimize() = 0;
|
||||||
|
|
||||||
|
/** Type checking should be performed by the node when this method is
|
||||||
|
called. In the event of an error, a NULL value may be returned.
|
||||||
|
As with ASTNode::Optimize(), the caller should store the returned
|
||||||
|
pointer in place of the original ASTNode *. */
|
||||||
|
virtual ASTNode *TypeCheck() = 0;
|
||||||
|
|
||||||
|
virtual int EstimateCost() const = 0;
|
||||||
|
|
||||||
|
/** All AST nodes must track the file position where they are
|
||||||
|
defined. */
|
||||||
|
const SourcePos pos;
|
||||||
|
};
|
||||||
|
|
||||||
/** @brief Structure that defines a compilation target
|
/** @brief Structure that defines a compilation target
|
||||||
|
|
||||||
@@ -161,28 +185,13 @@ struct Target {
|
|||||||
/** Returns the LLVM TargetMachine object corresponding to this
|
/** Returns the LLVM TargetMachine object corresponding to this
|
||||||
target. */
|
target. */
|
||||||
llvm::TargetMachine *GetTargetMachine() const;
|
llvm::TargetMachine *GetTargetMachine() const;
|
||||||
|
|
||||||
/** Returns a string like "avx" encoding the target. */
|
|
||||||
const char *GetISAString() const;
|
|
||||||
|
|
||||||
/** Returns the size of the given type */
|
|
||||||
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *type);
|
|
||||||
/** Given a structure type and an element number in the structure,
|
|
||||||
returns a value corresponding to the number of bytes from the start
|
|
||||||
of the structure where the element is located. */
|
|
||||||
llvm::Value *StructOffset(LLVM_TYPE_CONST llvm::Type *type,
|
|
||||||
int element);
|
|
||||||
|
|
||||||
/** llvm Target object representing this target. */
|
/** llvm Target object representing this target. */
|
||||||
const llvm::Target *target;
|
const llvm::Target *target;
|
||||||
|
|
||||||
/** Enumerator giving the instruction sets that the compiler can
|
/** Enumerator giving the instruction sets that the compiler can
|
||||||
target. These should be ordered from "worse" to "better" in that
|
target. */
|
||||||
if a processor supports multiple target ISAs, then the most
|
enum ISA { SSE2, SSE4, AVX };
|
||||||
flexible/performant of them will apear last in the enumerant. Note
|
|
||||||
also that __best_available_isa() needs to be updated if ISAs are
|
|
||||||
added or the enumerant values are reordered. */
|
|
||||||
enum ISA { SSE2, SSE4, AVX, NUM_ISAS };
|
|
||||||
|
|
||||||
/** Instruction set being compiled to. */
|
/** Instruction set being compiled to. */
|
||||||
ISA isa;
|
ISA isa;
|
||||||
@@ -191,7 +200,7 @@ struct Target {
|
|||||||
std::string arch;
|
std::string arch;
|
||||||
|
|
||||||
/** Is the target architecture 32 or 64 bit */
|
/** Is the target architecture 32 or 64 bit */
|
||||||
bool is32Bit;
|
bool is32bit;
|
||||||
|
|
||||||
/** Target CPU. (e.g. "corei7", "corei7-avx", ..) */
|
/** Target CPU. (e.g. "corei7", "corei7-avx", ..) */
|
||||||
std::string cpu;
|
std::string cpu;
|
||||||
@@ -241,22 +250,6 @@ struct Opt {
|
|||||||
it will make sense. */
|
it will make sense. */
|
||||||
bool unrollLoops;
|
bool unrollLoops;
|
||||||
|
|
||||||
/** Indicates if addressing math will be done with 32-bit math, even on
|
|
||||||
64-bit systems. (This is generally noticably more efficient,
|
|
||||||
though at the cost of addressing >2GB).
|
|
||||||
*/
|
|
||||||
bool force32BitAddressing;
|
|
||||||
|
|
||||||
/** Indicates whether assert() statements should be ignored (for
|
|
||||||
performance in the generated code). */
|
|
||||||
bool disableAsserts;
|
|
||||||
|
|
||||||
/** If enabled, the various __pseudo* memory ops (gather/scatter,
|
|
||||||
masked load/store) are left in their __pseudo* form, for better
|
|
||||||
understanding of the structure of generated code when reading
|
|
||||||
it. */
|
|
||||||
bool disableHandlePseudoMemoryOps;
|
|
||||||
|
|
||||||
/** On targets that don't have a masked store instruction but do have a
|
/** On targets that don't have a masked store instruction but do have a
|
||||||
blending instruction, by default, we simulate masked stores by
|
blending instruction, by default, we simulate masked stores by
|
||||||
loading the old value, blending, and storing the result. This can
|
loading the old value, blending, and storing the result. This can
|
||||||
@@ -348,13 +341,6 @@ struct Globals {
|
|||||||
/** Indicates whether all warning messages should be surpressed. */
|
/** Indicates whether all warning messages should be surpressed. */
|
||||||
bool disableWarnings;
|
bool disableWarnings;
|
||||||
|
|
||||||
/** Indicates whether warnings should be issued as errors. */
|
|
||||||
bool warningsAsErrors;
|
|
||||||
|
|
||||||
/** Indicates whether line wrapping of error messages to the terminal
|
|
||||||
width should be disabled. */
|
|
||||||
bool disableLineWrap;
|
|
||||||
|
|
||||||
/** Indicates whether additional warnings should be issued about
|
/** Indicates whether additional warnings should be issued about
|
||||||
possible performance pitfalls. */
|
possible performance pitfalls. */
|
||||||
bool emitPerfWarnings;
|
bool emitPerfWarnings;
|
||||||
@@ -368,10 +354,6 @@ struct Globals {
|
|||||||
/** Indicates whether ispc should generate debugging symbols for the
|
/** Indicates whether ispc should generate debugging symbols for the
|
||||||
program in its output. */
|
program in its output. */
|
||||||
bool generateDebuggingSymbols;
|
bool generateDebuggingSymbols;
|
||||||
|
|
||||||
/** If true, function names are mangled by appending the target ISA and
|
|
||||||
vector width to them. */
|
|
||||||
bool mangleFunctionsWithTarget;
|
|
||||||
|
|
||||||
/** Global LLVMContext object */
|
/** Global LLVMContext object */
|
||||||
llvm::LLVMContext *ctx;
|
llvm::LLVMContext *ctx;
|
||||||
@@ -391,8 +373,6 @@ enum {
|
|||||||
COST_COMPLEX_ARITH_OP = 4,
|
COST_COMPLEX_ARITH_OP = 4,
|
||||||
COST_DEREF = 4,
|
COST_DEREF = 4,
|
||||||
COST_FUNCALL = 4,
|
COST_FUNCALL = 4,
|
||||||
COST_FUNPTR_UNIFORM = 12,
|
|
||||||
COST_FUNPTR_VARYING = 24,
|
|
||||||
COST_GATHER = 8,
|
COST_GATHER = 8,
|
||||||
COST_LOAD = 2,
|
COST_LOAD = 2,
|
||||||
COST_REGULAR_BREAK_CONTINUE = 2,
|
COST_REGULAR_BREAK_CONTINUE = 2,
|
||||||
@@ -400,14 +380,11 @@ enum {
|
|||||||
COST_SELECT = 4,
|
COST_SELECT = 4,
|
||||||
COST_SIMPLE_ARITH_LOGIC_OP = 1,
|
COST_SIMPLE_ARITH_LOGIC_OP = 1,
|
||||||
COST_SYNC = 32,
|
COST_SYNC = 32,
|
||||||
COST_TASK_LAUNCH = 32,
|
COST_TASK_LAUNCH = 16,
|
||||||
COST_TYPECAST_COMPLEX = 4,
|
COST_TYPECAST_COMPLEX = 4,
|
||||||
COST_TYPECAST_SIMPLE = 1,
|
COST_TYPECAST_SIMPLE = 1,
|
||||||
COST_UNIFORM_IF = 2,
|
|
||||||
COST_VARYING_IF = 3,
|
|
||||||
COST_UNIFORM_LOOP = 4,
|
COST_UNIFORM_LOOP = 4,
|
||||||
COST_VARYING_LOOP = 6,
|
COST_VARYING_LOOP = 6,
|
||||||
COST_ASSERT = 8,
|
|
||||||
|
|
||||||
CHECK_MASK_AT_FUNCTION_START_COST = 16,
|
CHECK_MASK_AT_FUNCTION_START_COST = 16,
|
||||||
PREDICATE_SAFE_IF_STATEMENT_COST = 6,
|
PREDICATE_SAFE_IF_STATEMENT_COST = 6,
|
||||||
|
|||||||
78
ispc.vcxproj
78
ispc.vcxproj
@@ -1,4 +1,4 @@
|
|||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
<ItemGroup Label="ProjectConfigurations">
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
<ProjectConfiguration Include="Debug|Win32">
|
<ProjectConfiguration Include="Debug|Win32">
|
||||||
@@ -11,35 +11,25 @@
|
|||||||
</ProjectConfiguration>
|
</ProjectConfiguration>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClCompile Include="ast.cpp" />
|
|
||||||
<ClCompile Include="builtins.cpp" />
|
<ClCompile Include="builtins.cpp" />
|
||||||
<ClCompile Include="ctx.cpp" />
|
<ClCompile Include="ctx.cpp" />
|
||||||
<ClCompile Include="decl.cpp" />
|
<ClCompile Include="decl.cpp" />
|
||||||
<ClCompile Include="expr.cpp" />
|
<ClCompile Include="expr.cpp" />
|
||||||
<ClCompile Include="func.cpp" />
|
|
||||||
<ClCompile Include="gen-bitcode-avx.cpp" />
|
<ClCompile Include="gen-bitcode-avx.cpp" />
|
||||||
<ClCompile Include="gen-bitcode-avx-x2.cpp" />
|
<ClCompile Include="gen-bitcode-avx-x2.cpp" />
|
||||||
<ClCompile Include="gen-bitcode-c-32.cpp" />
|
<ClCompile Include="gen-bitcode-c-32.cpp" />
|
||||||
<ClCompile Include="gen-bitcode-c-64.cpp" />
|
<ClCompile Include="gen-bitcode-c-64.cpp" />
|
||||||
<ClCompile Include="gen-bitcode-dispatch.cpp" />
|
|
||||||
<ClCompile Include="gen-bitcode-sse2.cpp" />
|
<ClCompile Include="gen-bitcode-sse2.cpp" />
|
||||||
<ClCompile Include="gen-bitcode-sse2-x2.cpp" />
|
|
||||||
<ClCompile Include="gen-bitcode-sse4.cpp" />
|
<ClCompile Include="gen-bitcode-sse4.cpp" />
|
||||||
<ClCompile Include="gen-bitcode-sse4-x2.cpp" />
|
<ClCompile Include="gen-bitcode-sse4x2.cpp" />
|
||||||
<ClCompile Include="gen-stdlib.cpp" />
|
<ClCompile Include="gen-stdlib.cpp" />
|
||||||
<ClCompile Include="ispc.cpp" />
|
<ClCompile Include="ispc.cpp" />
|
||||||
<ClCompile Include="lex.cc">
|
<ClCompile Include="lex.cc" />
|
||||||
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4003;4018</DisableSpecificWarnings>
|
|
||||||
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">4146;4800;4996;4355;4624;4005;4003;4018</DisableSpecificWarnings>
|
|
||||||
</ClCompile>
|
|
||||||
<ClCompile Include="llvmutil.cpp" />
|
<ClCompile Include="llvmutil.cpp" />
|
||||||
<ClCompile Include="module.cpp" />
|
<ClCompile Include="module.cpp" />
|
||||||
<ClCompile Include="main.cpp" />
|
<ClCompile Include="main.cpp" />
|
||||||
<ClCompile Include="opt.cpp" />
|
<ClCompile Include="opt.cpp" />
|
||||||
<ClCompile Include="parse.cc">
|
<ClCompile Include="parse.cc" />
|
||||||
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
|
|
||||||
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
|
|
||||||
</ClCompile>
|
|
||||||
<CustomBuild Include="builtins-c.c">
|
<CustomBuild Include="builtins-c.c">
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp;
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp;
|
||||||
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp</Command>
|
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp</Command>
|
||||||
@@ -56,12 +46,10 @@
|
|||||||
<ClCompile Include="util.cpp" />
|
<ClCompile Include="util.cpp" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClInclude Include="ast.h" />
|
|
||||||
<ClInclude Include="builtins.h" />
|
<ClInclude Include="builtins.h" />
|
||||||
<ClInclude Include="ctx.h" />
|
<ClInclude Include="ctx.h" />
|
||||||
<ClInclude Include="decl.h" />
|
<ClInclude Include="decl.h" />
|
||||||
<ClInclude Include="expr.h" />
|
<ClInclude Include="expr.h" />
|
||||||
<ClInclude Include="func.h" />
|
|
||||||
<ClInclude Include="ispc.h" />
|
<ClInclude Include="ispc.h" />
|
||||||
<ClInclude Include="llvmutil.h" />
|
<ClInclude Include="llvmutil.h" />
|
||||||
<ClInclude Include="module.h" />
|
<ClInclude Include="module.h" />
|
||||||
@@ -88,38 +76,25 @@
|
|||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs>
|
||||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
|
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs>
|
||||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
|
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message>
|
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message>
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message>
|
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="builtins-dispatch.ll">
|
<CustomBuild Include="builtins-sse4x2.ll">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp</Command>
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-dispatch.cpp</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
|
||||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4</AdditionalInputs>
|
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp</Command>
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-dispatch.cpp</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
|
||||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4</AdditionalInputs>
|
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-dispatch.cpp</Message>
|
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-dispatch.cpp</Message>
|
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
|
||||||
</CustomBuild>
|
|
||||||
</ItemGroup>
|
|
||||||
<ItemGroup>
|
|
||||||
<CustomBuild Include="builtins-sse4-x2.ll">
|
|
||||||
<FileType>Document</FileType>
|
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp</Command>
|
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
|
|
||||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
|
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp</Command>
|
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
|
|
||||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
|
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
|
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
|
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
@@ -127,36 +102,23 @@
|
|||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2.cpp</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2.cpp</Outputs>
|
||||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
|
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2.cpp</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2.cpp</Outputs>
|
||||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
|
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2.cpp</Message>
|
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2.cpp</Message>
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2.cpp</Message>
|
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2.cpp</Message>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
|
||||||
<CustomBuild Include="builtins-sse2-x2.ll">
|
|
||||||
<FileType>Document</FileType>
|
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp</Command>
|
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2-x2.cpp</Outputs>
|
|
||||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
|
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp</Command>
|
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2-x2.cpp</Outputs>
|
|
||||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
|
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2-x2.cpp</Message>
|
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2-x2.cpp</Message>
|
|
||||||
</CustomBuild>
|
|
||||||
</ItemGroup>
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<CustomBuild Include="builtins-avx.ll">
|
<CustomBuild Include="builtins-avx.ll">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp</Command>
|
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
|
||||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-avx-common.ll</AdditionalInputs>
|
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp</Command>
|
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
|
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
|
||||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-avx-common.ll</AdditionalInputs>
|
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
|
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
|
||||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
|
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
@@ -268,4 +230,4 @@
|
|||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ extern "C" {
|
|||||||
#include <llvm/DerivedTypes.h>
|
#include <llvm/DerivedTypes.h>
|
||||||
#include <llvm/Instructions.h>
|
#include <llvm/Instructions.h>
|
||||||
#include <llvm/ExecutionEngine/ExecutionEngine.h>
|
#include <llvm/ExecutionEngine/ExecutionEngine.h>
|
||||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
#include <llvm/Support/TargetRegistry.h>
|
#include <llvm/Support/TargetRegistry.h>
|
||||||
#include <llvm/Support/TargetSelect.h>
|
#include <llvm/Support/TargetSelect.h>
|
||||||
#else
|
#else
|
||||||
@@ -120,7 +120,7 @@ void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
|
|||||||
*handle = (void *)0xdeadbeef;
|
*handle = (void *)0xdeadbeef;
|
||||||
// leak time!
|
// leak time!
|
||||||
#ifdef ISPC_IS_WINDOWS
|
#ifdef ISPC_IS_WINDOWS
|
||||||
return _aligned_malloc((size_t)size, alignment);
|
return _aligned_malloc(size, alignment);
|
||||||
#endif
|
#endif
|
||||||
#ifdef ISPC_IS_LINUX
|
#ifdef ISPC_IS_LINUX
|
||||||
return memalign(alignment, size);
|
return memalign(alignment, size);
|
||||||
@@ -182,7 +182,7 @@ static bool lRunTest(const char *fn) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::string eeError;
|
std::string eeError;
|
||||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
llvm::EngineBuilder engineBuilder(module);
|
llvm::EngineBuilder engineBuilder(module);
|
||||||
engineBuilder.setErrorStr(&eeError);
|
engineBuilder.setErrorStr(&eeError);
|
||||||
engineBuilder.setEngineKind(llvm::EngineKind::JIT);
|
engineBuilder.setEngineKind(llvm::EngineKind::JIT);
|
||||||
@@ -361,7 +361,7 @@ static bool lRunTest(const char *fn) {
|
|||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
llvm::InitializeNativeTarget();
|
llvm::InitializeNativeTarget();
|
||||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||||
LLVMLinkInJIT();
|
LLVMLinkInJIT();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@@ -54,7 +54,6 @@
|
|||||||
<Optimization>Disabled</Optimization>
|
<Optimization>Disabled</Optimization>
|
||||||
<PreprocessorDefinitions>LLVM_3_0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>LLVM_3_0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||||
<DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
|
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
<SubSystem>Console</SubSystem>
|
<SubSystem>Console</SubSystem>
|
||||||
@@ -73,7 +72,6 @@
|
|||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>LLVM_3_0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>LLVM_3_0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||||
<DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
|
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
<SubSystem>Console</SubSystem>
|
<SubSystem>Console</SubSystem>
|
||||||
|
|||||||
18
lex.ll
18
lex.ll
@@ -34,13 +34,13 @@
|
|||||||
%{
|
%{
|
||||||
|
|
||||||
#include "ispc.h"
|
#include "ispc.h"
|
||||||
|
#include "decl.h"
|
||||||
#include "sym.h"
|
#include "sym.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "module.h"
|
#include "module.h"
|
||||||
#include "type.h"
|
#include "type.h"
|
||||||
#include "parse.hh"
|
#include "parse.hh"
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
static uint64_t lParseBinary(const char *ptr, SourcePos pos);
|
static uint64_t lParseBinary(const char *ptr, SourcePos pos);
|
||||||
static void lCComment(SourcePos *);
|
static void lCComment(SourcePos *);
|
||||||
@@ -78,7 +78,6 @@ ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+
|
|||||||
"/*" { lCComment(yylloc); }
|
"/*" { lCComment(yylloc); }
|
||||||
"//" { lCppComment(yylloc); }
|
"//" { lCppComment(yylloc); }
|
||||||
|
|
||||||
__assert { return TOKEN_ASSERT; }
|
|
||||||
bool { return TOKEN_BOOL; }
|
bool { return TOKEN_BOOL; }
|
||||||
break { return TOKEN_BREAK; }
|
break { return TOKEN_BREAK; }
|
||||||
case { return TOKEN_CASE; }
|
case { return TOKEN_CASE; }
|
||||||
@@ -86,6 +85,7 @@ cbreak { return TOKEN_CBREAK; }
|
|||||||
ccontinue { return TOKEN_CCONTINUE; }
|
ccontinue { return TOKEN_CCONTINUE; }
|
||||||
cdo { return TOKEN_CDO; }
|
cdo { return TOKEN_CDO; }
|
||||||
cfor { return TOKEN_CFOR; }
|
cfor { return TOKEN_CFOR; }
|
||||||
|
char { return TOKEN_CHAR; }
|
||||||
cif { return TOKEN_CIF; }
|
cif { return TOKEN_CIF; }
|
||||||
cwhile { return TOKEN_CWHILE; }
|
cwhile { return TOKEN_CWHILE; }
|
||||||
const { return TOKEN_CONST; }
|
const { return TOKEN_CONST; }
|
||||||
@@ -101,8 +101,6 @@ extern { return TOKEN_EXTERN; }
|
|||||||
false { return TOKEN_FALSE; }
|
false { return TOKEN_FALSE; }
|
||||||
float { return TOKEN_FLOAT; }
|
float { return TOKEN_FLOAT; }
|
||||||
for { return TOKEN_FOR; }
|
for { return TOKEN_FOR; }
|
||||||
foreach { return TOKEN_FOREACH; }
|
|
||||||
foreach_tiled { return TOKEN_FOREACH_TILED; }
|
|
||||||
goto { return TOKEN_GOTO; }
|
goto { return TOKEN_GOTO; }
|
||||||
if { return TOKEN_IF; }
|
if { return TOKEN_IF; }
|
||||||
inline { return TOKEN_INLINE; }
|
inline { return TOKEN_INLINE; }
|
||||||
@@ -112,15 +110,10 @@ int16 { return TOKEN_INT16; }
|
|||||||
int32 { return TOKEN_INT; }
|
int32 { return TOKEN_INT; }
|
||||||
int64 { return TOKEN_INT64; }
|
int64 { return TOKEN_INT64; }
|
||||||
launch { return TOKEN_LAUNCH; }
|
launch { return TOKEN_LAUNCH; }
|
||||||
NULL { return TOKEN_NULL; }
|
|
||||||
print { return TOKEN_PRINT; }
|
print { return TOKEN_PRINT; }
|
||||||
reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
|
reference { return TOKEN_REFERENCE; }
|
||||||
"please use C++-style '&' syntax for references "
|
|
||||||
"instead."); }
|
|
||||||
return { return TOKEN_RETURN; }
|
return { return TOKEN_RETURN; }
|
||||||
soa { return TOKEN_SOA; }
|
soa { return TOKEN_SOA; }
|
||||||
signed { return TOKEN_SIGNED; }
|
|
||||||
sizeof { return TOKEN_SIZEOF; }
|
|
||||||
static { return TOKEN_STATIC; }
|
static { return TOKEN_STATIC; }
|
||||||
struct { return TOKEN_STRUCT; }
|
struct { return TOKEN_STRUCT; }
|
||||||
switch { return TOKEN_SWITCH; }
|
switch { return TOKEN_SWITCH; }
|
||||||
@@ -133,8 +126,6 @@ unsigned { return TOKEN_UNSIGNED; }
|
|||||||
varying { return TOKEN_VARYING; }
|
varying { return TOKEN_VARYING; }
|
||||||
void { return TOKEN_VOID; }
|
void { return TOKEN_VOID; }
|
||||||
while { return TOKEN_WHILE; }
|
while { return TOKEN_WHILE; }
|
||||||
\"C\" { return TOKEN_STRING_C_LITERAL; }
|
|
||||||
\.\.\. { return TOKEN_DOTDOTDOT; }
|
|
||||||
|
|
||||||
L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; }
|
L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; }
|
||||||
|
|
||||||
@@ -230,7 +221,6 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
|
|||||||
"&=" { return TOKEN_AND_ASSIGN; }
|
"&=" { return TOKEN_AND_ASSIGN; }
|
||||||
"^=" { return TOKEN_XOR_ASSIGN; }
|
"^=" { return TOKEN_XOR_ASSIGN; }
|
||||||
"|=" { return TOKEN_OR_ASSIGN; }
|
"|=" { return TOKEN_OR_ASSIGN; }
|
||||||
"->" { return TOKEN_PTR_OP; }
|
|
||||||
";" { return ';'; }
|
";" { return ';'; }
|
||||||
("{"|"<%") { return '{'; }
|
("{"|"<%") { return '{'; }
|
||||||
("}"|"%>") { return '}'; }
|
("}"|"%>") { return '}'; }
|
||||||
@@ -274,6 +264,8 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
|
|||||||
|
|
||||||
%%
|
%%
|
||||||
|
|
||||||
|
/*sizeof { return TOKEN_SIZEOF; }*/
|
||||||
|
/*"->" { return TOKEN_PTR_OP; }*/
|
||||||
/*short { return TOKEN_SHORT; }*/
|
/*short { return TOKEN_SHORT; }*/
|
||||||
/*long { return TOKEN_LONG; }*/
|
/*long { return TOKEN_LONG; }*/
|
||||||
/*signed { return TOKEN_SIGNED; }*/
|
/*signed { return TOKEN_SIGNED; }*/
|
||||||
|
|||||||
17
llvmutil.cpp
17
llvmutil.cpp
@@ -40,7 +40,6 @@
|
|||||||
|
|
||||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
|
LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
|
||||||
LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
|
LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
|
||||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::PointerIntType = NULL;
|
|
||||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::BoolType = NULL;
|
LLVM_TYPE_CONST llvm::Type *LLVMTypes::BoolType = NULL;
|
||||||
|
|
||||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8Type = NULL;
|
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8Type = NULL;
|
||||||
@@ -75,7 +74,7 @@ LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
|
|||||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
|
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
|
||||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
|
LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
|
||||||
|
|
||||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::VoidPointerVectorType = NULL;
|
LLVM_TYPE_CONST llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
|
||||||
|
|
||||||
llvm::Constant *LLVMTrue = NULL;
|
llvm::Constant *LLVMTrue = NULL;
|
||||||
llvm::Constant *LLVMFalse = NULL;
|
llvm::Constant *LLVMFalse = NULL;
|
||||||
@@ -87,8 +86,6 @@ void
|
|||||||
InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
|
InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
|
||||||
LLVMTypes::VoidType = llvm::Type::getVoidTy(*ctx);
|
LLVMTypes::VoidType = llvm::Type::getVoidTy(*ctx);
|
||||||
LLVMTypes::VoidPointerType = llvm::PointerType::get(llvm::Type::getInt8Ty(*ctx), 0);
|
LLVMTypes::VoidPointerType = llvm::PointerType::get(llvm::Type::getInt8Ty(*ctx), 0);
|
||||||
LLVMTypes::PointerIntType = target.is32Bit ? llvm::Type::getInt32Ty(*ctx) :
|
|
||||||
llvm::Type::getInt64Ty(*ctx);
|
|
||||||
|
|
||||||
LLVMTypes::BoolType = llvm::Type::getInt1Ty(*ctx);
|
LLVMTypes::BoolType = llvm::Type::getInt1Ty(*ctx);
|
||||||
LLVMTypes::Int8Type = llvm::Type::getInt8Ty(*ctx);
|
LLVMTypes::Int8Type = llvm::Type::getInt8Ty(*ctx);
|
||||||
@@ -133,8 +130,8 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
|
|||||||
LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
|
LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
|
||||||
LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);
|
LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);
|
||||||
|
|
||||||
LLVMTypes::VoidPointerVectorType = g->target.is32Bit ? LLVMTypes::Int32VectorType :
|
LLVMTypes::VoidPointerVectorType =
|
||||||
LLVMTypes::Int64VectorType;
|
llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth);
|
||||||
|
|
||||||
LLVMTrue = llvm::ConstantInt::getTrue(*ctx);
|
LLVMTrue = llvm::ConstantInt::getTrue(*ctx);
|
||||||
LLVMFalse = llvm::ConstantInt::getFalse(*ctx);
|
LLVMFalse = llvm::ConstantInt::getFalse(*ctx);
|
||||||
@@ -454,3 +451,11 @@ LLVMBoolVector(const bool *bvec) {
|
|||||||
}
|
}
|
||||||
return llvm::ConstantVector::get(vals);
|
return llvm::ConstantVector::get(vals);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
LLVM_TYPE_CONST llvm::ArrayType *
|
||||||
|
LLVMPointerVectorType(LLVM_TYPE_CONST llvm::Type *t) {
|
||||||
|
// NOTE: ArrayType, not VectorType
|
||||||
|
return llvm::ArrayType::get(llvm::PointerType::get(t, 0),
|
||||||
|
g->target.vectorWidth);
|
||||||
|
}
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user