Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2f35bc1a0f | ||
|
|
1620e0508d |
20
Makefile
20
Makefile
@@ -25,8 +25,7 @@ BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
|
||||
|
||||
CXX=g++
|
||||
CPP=cpp
|
||||
OPT=-g3
|
||||
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
|
||||
CXXFLAGS=-g3 $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
|
||||
-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
|
||||
|
||||
LDFLAGS=
|
||||
@@ -45,13 +44,13 @@ YACC=bison -d -v -t
|
||||
|
||||
###########################################################################
|
||||
|
||||
CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
|
||||
CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
|
||||
llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
|
||||
util.cpp
|
||||
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
|
||||
HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
|
||||
opt.h stmt.h sym.h type.h util.h
|
||||
BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll builtins-sse2-x2.ll \
|
||||
builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
|
||||
BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
|
||||
builtins-sse4.ll builtins-sse4x2.ll
|
||||
BISON_SRC=parse.yy
|
||||
FLEX_SRC=lex.ll
|
||||
|
||||
@@ -112,7 +111,7 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/builtins-%.cpp: builtins-%.ll
|
||||
objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll
|
||||
@echo Creating C++ source from builtin definitions file $<
|
||||
@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
|
||||
|
||||
@@ -143,10 +142,3 @@ objs/stdlib_ispc.cpp: stdlib.ispc
|
||||
objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/builtins-sse2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2.ll
|
||||
objs/builtins-sse2-x2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2-x2.ll
|
||||
objs/builtins-sse4.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4.ll
|
||||
objs/builtins-sse4-x2.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4-x2.ll
|
||||
objs/builtins-avx.cpp: builtins.m4 builtins-avx-common.ll builtins-avx.ll
|
||||
objs/builtins-avx-x2.cpp: builtins.m4 builtins-avx-common.ll builtins-avx-x2.ll
|
||||
|
||||
94
ast.h
94
ast.h
@@ -1,94 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file ast.h
|
||||
@brief
|
||||
*/
|
||||
|
||||
#ifndef ISPC_AST_H
|
||||
#define ISPC_AST_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include <vector>
|
||||
|
||||
/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
|
||||
|
||||
This class defines a basic interface that all abstract syntax tree
|
||||
(AST) nodes must implement. The base classes for both expressions
|
||||
(Expr) and statements (Stmt) inherit from this class.
|
||||
*/
|
||||
class ASTNode {
|
||||
public:
|
||||
ASTNode(SourcePos p) : pos(p) { }
|
||||
virtual ~ASTNode();
|
||||
|
||||
/** The Optimize() method should perform any appropriate early-stage
|
||||
optimizations on the node (e.g. constant folding). The caller
|
||||
should use the returned ASTNode * in place of the original node.
|
||||
This method may return NULL if an error is encountered during
|
||||
optimization. */
|
||||
virtual ASTNode *Optimize() = 0;
|
||||
|
||||
/** Type checking should be performed by the node when this method is
|
||||
called. In the event of an error, a NULL value may be returned.
|
||||
As with ASTNode::Optimize(), the caller should store the returned
|
||||
pointer in place of the original ASTNode *. */
|
||||
virtual ASTNode *TypeCheck() = 0;
|
||||
|
||||
virtual int EstimateCost() const = 0;
|
||||
|
||||
/** All AST nodes must track the file position where they are
|
||||
defined. */
|
||||
SourcePos pos;
|
||||
};
|
||||
|
||||
|
||||
/** Simple representation of the abstract syntax trees for all of the
|
||||
functions declared in a compilation unit.
|
||||
*/
|
||||
class AST {
|
||||
public:
|
||||
/** Add the AST for a function described by the given declaration
|
||||
information and source code. */
|
||||
void AddFunction(Symbol *sym, const std::vector<Symbol *> &args,
|
||||
Stmt *code);
|
||||
|
||||
/** Generate LLVM IR for all of the functions into the current
|
||||
module. */
|
||||
void GenerateIR();
|
||||
|
||||
private:
|
||||
std::vector<Function *> functions;
|
||||
};
|
||||
|
||||
#endif // ISPC_AST_H
|
||||
16
buildall.bat
16
buildall.bat
@@ -1,16 +0,0 @@
|
||||
@echo off
|
||||
|
||||
REM If LLVM_INSTALL_DIR isn't set globally in your environment,
|
||||
REM it can be set here_
|
||||
set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
|
||||
|
||||
REM Both the LLVM binaries and python need to be in the path
|
||||
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
|
||||
|
||||
msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
|
||||
msbuild ispc_test.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
|
||||
|
||||
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Release /t:rebuild
|
||||
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Debug /t:rebuild
|
||||
msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild
|
||||
msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Debug /t:rebuild
|
||||
@@ -30,14 +30,18 @@
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; AVX target implementation.
|
||||
;; *** Untested *** AVX target implementation.
|
||||
;;
|
||||
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||
;; chance that there are bugs in the code in this file.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float iv = extract(__rcp_u(v), 0);
|
||||
; return iv * (2. - v * iv);
|
||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||
@@ -56,7 +60,7 @@ define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
; the roundss intrinsic is a total mess--docs say:
|
||||
;
|
||||
@@ -79,7 +83,7 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
@@ -88,7 +92,7 @@ define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
@@ -102,14 +106,14 @@ define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
|
||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||
|
||||
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
@@ -118,7 +122,7 @@ define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
@@ -133,7 +137,7 @@ define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||
@@ -154,7 +158,7 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
@@ -166,7 +170,7 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define void @__fastmath() nounwind alwaysinline {
|
||||
define internal void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
@@ -185,12 +189,12 @@ define void @__fastmath() nounwind alwaysinline {
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
@@ -202,12 +206,12 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
@@ -219,12 +223,12 @@ define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
@@ -234,14 +238,14 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
@@ -251,7 +255,7 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
|
||||
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
@@ -263,12 +267,12 @@ define double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
@@ -29,6 +29,13 @@
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; *** Untested *** AVX target implementation.
|
||||
;;
|
||||
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||
;; chance that there are bugs in the code in this file.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Basic 16-wide definitions
|
||||
|
||||
@@ -44,7 +51,7 @@ include(`builtins-avx-common.ll')
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
|
||||
@@ -64,17 +71,17 @@ define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysi
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
define internal <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
round8to16(%0, 8)
|
||||
}
|
||||
|
||||
define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
define internal <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round8to16(%0, 9)
|
||||
}
|
||||
|
||||
define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round8to16(%0, 10)
|
||||
}
|
||||
@@ -84,15 +91,15 @@ define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly always
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
||||
|
||||
define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
define internal <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
round4to16double(%0, 8)
|
||||
}
|
||||
|
||||
define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
define internal <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
round4to16double(%0, 9)
|
||||
}
|
||||
|
||||
define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
round4to16double(%0, 10)
|
||||
}
|
||||
|
||||
@@ -102,7 +109,7 @@ define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alw
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
|
||||
define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
@@ -125,7 +132,7 @@ define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly al
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
@@ -153,13 +160,13 @@ declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
|
||||
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define <16 x float> @__max_varying_float(<16 x float>,
|
||||
define internal <16 x float> @__max_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
|
||||
define <16 x float> @__min_varying_float(<16 x float>,
|
||||
define internal <16 x float> @__min_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
|
||||
ret <16 x float> %call
|
||||
@@ -169,12 +176,12 @@ define <16 x float> @__min_varying_float(<16 x float>,
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
define internal <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
define internal <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
@@ -183,12 +190,12 @@ define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
define internal <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
define internal <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
@@ -198,7 +205,7 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
@@ -217,7 +224,7 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
define internal float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
%va = shufflevector <16 x float> %0, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%vb = shufflevector <16 x float> %0, <16 x float> undef,
|
||||
@@ -232,12 +239,12 @@ define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
}
|
||||
|
||||
|
||||
define float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
|
||||
define internal float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
|
||||
reduce16(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
|
||||
define float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
|
||||
define internal float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
|
||||
reduce16(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
@@ -246,28 +253,28 @@ reduce_equal(16)
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int32 ops
|
||||
|
||||
define <16 x i32> @__add_varying_int32(<16 x i32>,
|
||||
define internal <16 x i32> @__add_varying_int32(<16 x i32>,
|
||||
<16 x i32>) nounwind readnone alwaysinline {
|
||||
%s = add <16 x i32> %0, %1
|
||||
ret <16 x i32> %s
|
||||
}
|
||||
|
||||
define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
%s = add i32 %0, %1
|
||||
ret i32 %s
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
@@ -275,17 +282,17 @@ define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint32 ops
|
||||
|
||||
define i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<16 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
@@ -295,7 +302,7 @@ define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
define double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
define internal double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
%va = shufflevector <16 x double> %0, <16 x double> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%vb = shufflevector <16 x double> %0, <16 x double> undef,
|
||||
@@ -315,12 +322,12 @@ define double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline
|
||||
ret double %sum
|
||||
}
|
||||
|
||||
define double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
|
||||
define internal double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
|
||||
reduce16(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
|
||||
define double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
|
||||
define internal double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
|
||||
reduce16(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
@@ -328,28 +335,28 @@ define double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int64 ops
|
||||
|
||||
define <16 x i64> @__add_varying_int64(<16 x i64>,
|
||||
define internal <16 x i64> @__add_varying_int64(<16 x i64>,
|
||||
<16 x i64>) nounwind readnone alwaysinline {
|
||||
%s = add <16 x i64> %0, %1
|
||||
ret <16 x i64> %s
|
||||
}
|
||||
|
||||
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
%s = add i64 %0, %1
|
||||
ret i64 %s
|
||||
}
|
||||
|
||||
define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
define internal i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
define internal i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
@@ -357,17 +364,17 @@ define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint64 ops
|
||||
|
||||
define i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
|
||||
define internal i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
|
||||
%r = call i64 @__reduce_add_int64(<16 x i64> %v)
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
define internal i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
|
||||
define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
@@ -635,7 +642,7 @@ gen_scatter(16, i64)
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
||||
|
||||
define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
|
||||
define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
|
||||
unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
||||
ret <16 x double> %ret
|
||||
}
|
||||
@@ -647,12 +654,12 @@ define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline
|
||||
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||
define internal <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||
binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
||||
ret <16 x double> %ret
|
||||
}
|
||||
|
||||
define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||
define internal <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||
binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
||||
ret <16 x double> %ret
|
||||
}
|
||||
|
||||
@@ -29,6 +29,13 @@
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; *** Untested *** AVX target implementation.
|
||||
;;
|
||||
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||
;; chance that there are bugs in the code in this file.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Basic 8-wide definitions
|
||||
|
||||
@@ -44,7 +51,7 @@ include(`builtins-avx-common.ll')
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
|
||||
@@ -62,19 +69,19 @@ define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinl
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 8)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
|
||||
ret <8 x float> %call
|
||||
@@ -85,17 +92,17 @@ define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysin
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
||||
|
||||
define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
round4to8double(%0, 8)
|
||||
}
|
||||
|
||||
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
round4to8double(%0, 9)
|
||||
}
|
||||
|
||||
|
||||
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
round4to8double(%0, 10)
|
||||
}
|
||||
@@ -106,7 +113,7 @@ define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alway
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||
define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
%is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
@@ -125,7 +132,7 @@ define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwa
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
@@ -153,13 +160,13 @@ declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
|
||||
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__max_varying_float(<8 x float>,
|
||||
define internal <8 x float> @__max_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define <8 x float> @__min_varying_float(<8 x float>,
|
||||
define internal <8 x float> @__min_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
|
||||
ret <8 x float> %call
|
||||
@@ -169,12 +176,12 @@ define <8 x float> @__min_varying_float(<8 x float>,
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
@@ -183,12 +190,12 @@ define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly al
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
@@ -198,7 +205,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
@@ -209,7 +216,7 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
|
||||
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
|
||||
%scalar1 = extractelement <8 x float> %v2, i32 0
|
||||
@@ -219,12 +226,12 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
}
|
||||
|
||||
|
||||
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
|
||||
define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
@@ -233,28 +240,28 @@ reduce_equal(8)
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int32 ops
|
||||
|
||||
define <8 x i32> @__add_varying_int32(<8 x i32>,
|
||||
define internal <8 x i32> @__add_varying_int32(<8 x i32>,
|
||||
<8 x i32>) nounwind readnone alwaysinline {
|
||||
%s = add <8 x i32> %0, %1
|
||||
ret <8 x i32> %s
|
||||
}
|
||||
|
||||
define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
%s = add i32 %0, %1
|
||||
ret i32 %s
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
@@ -262,17 +269,17 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint32 ops
|
||||
|
||||
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
@@ -282,7 +289,7 @@ define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
%v0 = shufflevector <8 x double> %0, <8 x double> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v1 = shufflevector <8 x double> %0, <8 x double> undef,
|
||||
@@ -296,12 +303,12 @@ define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline
|
||||
ret double %sum
|
||||
}
|
||||
|
||||
define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
|
||||
define internal double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
|
||||
reduce8(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
|
||||
define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
|
||||
define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
|
||||
reduce8(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
@@ -309,28 +316,28 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int64 ops
|
||||
|
||||
define <8 x i64> @__add_varying_int64(<8 x i64>,
|
||||
define internal <8 x i64> @__add_varying_int64(<8 x i64>,
|
||||
<8 x i64>) nounwind readnone alwaysinline {
|
||||
%s = add <8 x i64> %0, %1
|
||||
ret <8 x i64> %s
|
||||
}
|
||||
|
||||
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
%s = add i64 %0, %1
|
||||
ret i64 %s
|
||||
}
|
||||
|
||||
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
@@ -338,17 +345,17 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint64 ops
|
||||
|
||||
define i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
|
||||
define internal i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
|
||||
%r = call i64 @__reduce_add_int64(<8 x i64> %v)
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
|
||||
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
|
||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
@@ -533,7 +540,7 @@ gen_scatter(8, i64)
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
||||
|
||||
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
@@ -545,12 +552,12 @@ define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
27
builtins-c.c
27
builtins-c.c
@@ -57,7 +57,6 @@
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
typedef int Bool;
|
||||
@@ -133,8 +132,6 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
||||
case 'V': PRINT_VECTOR("%llu", unsigned long long);
|
||||
case 'd': PRINT_SCALAR("%f", double);
|
||||
case 'D': PRINT_VECTOR("%f", double);
|
||||
case 'p': PRINT_SCALAR("%p", void *);
|
||||
case 'P': PRINT_VECTOR("%p", void *);
|
||||
default:
|
||||
printf("UNKNOWN TYPE ");
|
||||
putchar(*types);
|
||||
@@ -150,21 +147,21 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
||||
|
||||
int __num_cores() {
|
||||
#ifdef _MSC_VER
|
||||
// This is quite a hack. Including all of windows.h to get this definition
|
||||
// pulls in a bunch of stuff that leads to undefined symbols at link time.
|
||||
// So we don't #include <windows.h> but instead have the equivalent declarations
|
||||
// here. Presumably this struct declaration won't be changing in the future
|
||||
// anyway...
|
||||
struct SYSTEM_INFO {
|
||||
int pad0[2];
|
||||
void *pad1[2];
|
||||
int *pad2;
|
||||
int dwNumberOfProcessors;
|
||||
// This is quite a hack. Including all of windows.h to get this definition
|
||||
// pulls in a bunch of stuff that leads to undefined symbols at link time.
|
||||
// So we don't #include <windows.h> but instead have the equivalent declarations
|
||||
// here. Presumably this struct declaration won't be changing in the future
|
||||
// anyway...
|
||||
struct SYSTEM_INFO {
|
||||
int pad0[2];
|
||||
void *pad1[2];
|
||||
int *pad2;
|
||||
int dwNumberOfProcessors;
|
||||
int pad3[3];
|
||||
};
|
||||
};
|
||||
|
||||
struct SYSTEM_INFO sysInfo;
|
||||
extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
|
||||
extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
|
||||
GetSystemInfo(&sysInfo);
|
||||
return sysInfo.dwNumberOfProcessors;
|
||||
#else
|
||||
|
||||
@@ -1,123 +0,0 @@
|
||||
;; Copyright (c) 2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;; This file defines various functions that are used when generating the
|
||||
;; the "dispatch" object/assembly file that has entrypoints for each
|
||||
;; exported function in a module that dispatch to the best available
|
||||
;; variant of that function that will run on the system's CPU.
|
||||
|
||||
;; Stores the best target ISA that the system on which we're actually
|
||||
;; running supports. -1 represents "uninitialized", otherwise this value
|
||||
;; should correspond to one of the enumerant values of Target::ISA from
|
||||
;; ispc.h.
|
||||
|
||||
@__system_best_isa = internal global i32 -1
|
||||
|
||||
declare void @abort() noreturn
|
||||
|
||||
;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
|
||||
;; following code... Specifically, __get_system_isa should return a value
|
||||
;; corresponding to one of the Target::ISA enumerant values that gives the
|
||||
;; most capable ISA that the curremt system can run.
|
||||
;;
|
||||
;; #ifdef _MSC_VER
|
||||
;; extern void __stdcall __cpuid(int info[4], int infoType);
|
||||
;; #else
|
||||
;; static void __cpuid(int info[4], int infoType) {
|
||||
;; __asm__ __volatile__ ("cpuid"
|
||||
;; : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
;; : "0" (infoType));
|
||||
;; }
|
||||
;; #endif
|
||||
;;
|
||||
;; int32_t __get_system_isa() {
|
||||
;; int info[4];
|
||||
;; __cpuid(info, 1);
|
||||
;; /* NOTE: the values returned below must be the same as the
|
||||
;; corresponding enumerant values in Target::ISA. */
|
||||
;; if ((info[2] & (1 << 28)) != 0)
|
||||
;; return 2; // AVX
|
||||
;; else if ((info[2] & (1 << 19)) != 0)
|
||||
;; return 1; // SSE4
|
||||
;; else if ((info[3] & (1 << 26)) != 0)
|
||||
;; return 0; // SSE2
|
||||
;; else
|
||||
;; abort();
|
||||
;; }
|
||||
|
||||
%0 = type { i32, i32, i32, i32 }
|
||||
|
||||
define i32 @__get_system_isa() nounwind ssp {
|
||||
%1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
|
||||
%2 = extractvalue %0 %1, 2
|
||||
%3 = extractvalue %0 %1, 3
|
||||
%4 = and i32 %2, 268435456
|
||||
%5 = icmp eq i32 %4, 0
|
||||
br i1 %5, label %6, label %13
|
||||
|
||||
; <label>:6 ; preds = %0
|
||||
%7 = and i32 %2, 524288
|
||||
%8 = icmp eq i32 %7, 0
|
||||
br i1 %8, label %9, label %13
|
||||
|
||||
; <label>:9 ; preds = %6
|
||||
%10 = and i32 %3, 67108864
|
||||
%11 = icmp eq i32 %10, 0
|
||||
br i1 %11, label %12, label %13
|
||||
|
||||
; <label>:12 ; preds = %9
|
||||
tail call void @abort() noreturn nounwind
|
||||
unreachable
|
||||
|
||||
; <label>:13 ; preds = %9, %6, %0
|
||||
%.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]
|
||||
ret i32 %.0
|
||||
}
|
||||
|
||||
|
||||
;; This function is called by each of the dispatch functions we generate;
|
||||
;; it sets @__system_best_isa if it is unset.
|
||||
|
||||
define void @__set_system_isa() {
|
||||
entry:
|
||||
%bi = load i32* @__system_best_isa
|
||||
%unset = icmp eq i32 %bi, -1
|
||||
br i1 %unset, label %set_system_isa, label %done
|
||||
|
||||
set_system_isa:
|
||||
%bival = call i32 @__get_system_isa()
|
||||
store i32 %bival, i32* @__system_best_isa
|
||||
ret void
|
||||
|
||||
done:
|
||||
ret void
|
||||
}
|
||||
|
||||
417
builtins-sse.ll
Normal file
417
builtins-sse.ll
Normal file
@@ -0,0 +1,417 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;; This file declares implementations of various stdlib builtins that
|
||||
;; only require SSE version 1 and 2 functionality; this file, in turn
|
||||
;; is then included by builtins-sse2.ll and builtins-sse4.ll to provide
|
||||
;; those definitions for them.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
int64minmax(4)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
|
||||
; do one N-R iteration to improve precision
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
%v_iv = fmul <4 x float> %0, %call
|
||||
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
|
||||
%iv_mul = fmul <4 x float> %call, %two_minus
|
||||
ret <4 x float> %iv_mul
|
||||
}
|
||||
|
||||
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; do the rcpss call
|
||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
||||
%scall = extractelement <4 x float> %call, i32 0
|
||||
|
||||
; do one N-R iteration to improve precision, as above
|
||||
%v_iv = fmul float %0, %scall
|
||||
%two_minus = fsub float 2., %v_iv
|
||||
%iv_mul = fmul float %scall, %two_minus
|
||||
ret float %iv_mul
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
|
||||
; Newton-Raphson iteration to improve precision
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul <4 x float> %v, %is
|
||||
%v_is_is = fmul <4 x float> %v_is, %is
|
||||
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%is_mul = fmul <4 x float> %is, %three_sub
|
||||
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
ret <4 x float> %half_scale
|
||||
}
|
||||
|
||||
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||
%is = extractelement <4 x float> %vis, i32 0
|
||||
|
||||
; Newton-Raphson iteration to improve precision
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul float %0, %is
|
||||
%v_is_is = fmul float %v_is, %is
|
||||
%three_sub = fsub float 3., %v_is_is
|
||||
%is_mul = fmul float %is, %three_sub
|
||||
%half_scale = fmul float 0.5, %is_mul
|
||||
ret float %half_scale
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fast math mode
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define internal void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
|
||||
define internal <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
|
||||
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
|
||||
store <4 x float> %s, <4 x float> * %1
|
||||
ret void
|
||||
}
|
||||
|
||||
define internal <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define internal <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
|
||||
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
|
||||
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
|
||||
define internal double @__min_uniform_double(double, double) nounwind readnone {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
|
||||
define internal double @__max_uniform_double(double, double) nounwind readnone {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define internal i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
define internal float @__reduce_min_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
define internal float @__reduce_max_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
|
||||
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
|
||||
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||
%m1 = add <4 x i32> %v1, %v
|
||||
%m1a = extractelement <4 x i32> %m1, i32 0
|
||||
%m1b = extractelement <4 x i32> %m1, i32 1
|
||||
%sum = add i32 %m1a, %m1b
|
||||
ret i32 %sum
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
|
||||
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
|
||||
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
|
||||
<2 x i32> <i32 0, i32 1>
|
||||
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
|
||||
<2 x i32> <i32 2, i32 3>
|
||||
%sum = fadd <2 x double> %v0, %v1
|
||||
%e0 = extractelement <2 x double> %sum, i32 0
|
||||
%e1 = extractelement <2 x double> %sum, i32 1
|
||||
%m = fadd double %e0, %e1
|
||||
ret double %m
|
||||
}
|
||||
|
||||
define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
|
||||
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
||||
<2 x i32> <i32 0, i32 1>
|
||||
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
||||
<2 x i32> <i32 2, i32 3>
|
||||
%sum = add <2 x i64> %v0, %v1
|
||||
%e0 = extractelement <2 x i64> %sum, i32 0
|
||||
%e1 = extractelement <2 x i64> %sum, i32 1
|
||||
%m = add i64 %e0, %e1
|
||||
ret i64 %m
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
reduce_equal(4)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
masked_store_blend_8_16_by_4()
|
||||
|
||||
gen_masked_store(4, i8, 8)
|
||||
gen_masked_store(4, i16, 16)
|
||||
gen_masked_store(4, i32, 32)
|
||||
gen_masked_store(4, i64, 64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(4, i8, 8)
|
||||
load_and_broadcast(4, i16, 16)
|
||||
load_and_broadcast(4, i32, 32)
|
||||
load_and_broadcast(4, i64, 64)
|
||||
|
||||
load_masked(4, i8, 8, 1)
|
||||
load_masked(4, i16, 16, 2)
|
||||
load_masked(4, i32, 32, 4)
|
||||
load_masked(4, i64, 64, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather(4, i8)
|
||||
gen_gather(4, i16)
|
||||
gen_gather(4, i32)
|
||||
gen_gather(4, i64)
|
||||
|
||||
gen_scatter(4, i8)
|
||||
gen_scatter(4, i16)
|
||||
gen_scatter(4, i32)
|
||||
gen_scatter(4, i64)
|
||||
@@ -1,266 +0,0 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; do the rcpss call
|
||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
||||
%scall = extractelement <4 x float> %call, i32 0
|
||||
|
||||
; do one N-R iteration to improve precision, as above
|
||||
%v_iv = fmul float %0, %scall
|
||||
%two_minus = fsub float 2., %v_iv
|
||||
%iv_mul = fmul float %scall, %two_minus
|
||||
ret float %iv_mul
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||
%is = extractelement <4 x float> %vis, i32 0
|
||||
|
||||
; Newton-Raphson iteration to improve precision
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul float %0, %is
|
||||
%v_is_is = fmul float %v_is, %is
|
||||
%three_sub = fsub float 3., %v_is_is
|
||||
%is_mul = fmul float %is, %three_sub
|
||||
%half_scale = fmul float 0.5, %is_mul
|
||||
ret float %half_scale
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
|
||||
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fast math mode
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
|
||||
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define double @__min_uniform_double(double, double) nounwind readnone {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @__max_uniform_double(double, double) nounwind readnone {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
;; There are not any rounding instructions in SSE2, so we have to emulate
|
||||
;; the functionality with multiple instructions...
|
||||
|
||||
; The code for __round_* is the result of compiling the following source
|
||||
; code.
|
||||
;
|
||||
; export float Round(float x) {
|
||||
; unsigned int sign = signbits(x);
|
||||
; unsigned int ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; x += 0x1.0p23f;
|
||||
; x -= 0x1.0p23f;
|
||||
; ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; return x;
|
||||
;}
|
||||
|
||||
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
%float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
|
||||
%bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
|
||||
%bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
|
||||
%int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
|
||||
%binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
|
||||
%binop21.i = fadd float %binop.i, -8.388608e+06
|
||||
%float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
|
||||
%bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
|
||||
ret float %int_to_float_bitcast.i.i.i
|
||||
}
|
||||
|
||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||
;; bitcode from compiling the following source code...
|
||||
|
||||
;export float Floor(float x) {
|
||||
; float y = Round(x);
|
||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
||||
; float delta = -1.f;
|
||||
; unsigned int idelta = intbits(delta);
|
||||
; idelta &= cmp;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
|
||||
%bincmp.i = fcmp ogt float %calltmp.i, %0
|
||||
%selectexpr.i = sext i1 %bincmp.i to i32
|
||||
%bitop.i = and i32 %selectexpr.i, -1082130432
|
||||
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
|
||||
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret float %binop.i
|
||||
}
|
||||
|
||||
;; And here is the code we compiled to get the __ceil* functions below
|
||||
;
|
||||
;export uniform float Ceil(uniform float x) {
|
||||
; uniform float y = Round(x);
|
||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
||||
; uniform float delta = 1.f;
|
||||
; uniform int idelta = intbits(delta);
|
||||
; idelta &= yltx;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
|
||||
%bincmp.i = fcmp olt float %calltmp.i, %0
|
||||
%selectexpr.i = sext i1 %bincmp.i to i32
|
||||
%bitop.i = and i32 %selectexpr.i, 1065353216
|
||||
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
|
||||
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret float %binop.i
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare double @round(double)
|
||||
declare double @floor(double)
|
||||
declare double @ceil(double)
|
||||
|
||||
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%r = call double @round(double %0)
|
||||
ret double %r
|
||||
}
|
||||
|
||||
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%r = call double @floor(double %0)
|
||||
ret double %r
|
||||
}
|
||||
|
||||
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%r = call double @ceil(double %0)
|
||||
ret double %r
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32)
|
||||
declare i64 @llvm.ctpop.i64(i64)
|
||||
|
||||
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%val = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %val
|
||||
}
|
||||
|
||||
define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
|
||||
%val = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %val
|
||||
}
|
||||
|
||||
|
||||
@@ -1,631 +0,0 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
;; This file defines the target for "double-pumped" SSE2, i.e. running
|
||||
;; with 8-wide vectors
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; standard 8-wide definitions from m4 macros
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
|
||||
include(`builtins-sse2-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
|
||||
unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
|
||||
; do one N-R iteration
|
||||
%v_iv = fmul <8 x float> %0, %call
|
||||
%two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.>, %v_iv
|
||||
%iv_mul = fmul <8 x float> %call, %two_minus
|
||||
ret <8 x float> %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul <8 x float> %v, %is
|
||||
%v_is_is = fmul <8 x float> %v_is, %is
|
||||
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%is_mul = fmul <8 x float> %is, %three_sub
|
||||
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
ret <8 x float> %half_scale
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
|
||||
define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_sinf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_cosf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define void @__svml_sincos(<8 x float>, <8 x float> *,
|
||||
<8 x float> *) nounwind readnone alwaysinline {
|
||||
; call svml_sincosf4 two times with the two 4-wide sub-vectors
|
||||
%a = shufflevector <8 x float> %0, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%b = shufflevector <8 x float> %0, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
|
||||
%cospa = alloca <4 x float>
|
||||
%sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
|
||||
|
||||
%cospb = alloca <4 x float>
|
||||
%sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
|
||||
|
||||
%sin = shufflevector <4 x float> %sa, <4 x float> %sb,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 4, i32 5, i32 6, i32 7>
|
||||
store <8 x float> %sin, <8 x float> * %1
|
||||
|
||||
%cosa = load <4 x float> * %cospa
|
||||
%cosb = load <4 x float> * %cospb
|
||||
%cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 4, i32 5, i32 6, i32 7>
|
||||
store <8 x float> %cos, <8 x float> * %2
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_tanf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_atanf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_atan2(<8 x float>,
|
||||
<8 x float>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_expf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_logf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_pow(<8 x float>,
|
||||
<8 x float>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, float, @__svml_powf4, %0, %1)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; min/max
|
||||
|
||||
; There is no blend instruction with SSE2, so we simulate it with bit
|
||||
; operations on i32s. For these two vselect functions, for each
|
||||
; vector element, if the mask is on, we return the corresponding value
|
||||
; from %1, and otherwise return the value from %0.
|
||||
|
||||
define <8 x i32> @__vselect_i32(<8 x i32>, <8 x i32> ,
|
||||
<8 x i32> %mask) nounwind readnone alwaysinline {
|
||||
%notmask = xor <8 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
|
||||
%cleared_old = and <8 x i32> %0, %notmask
|
||||
%masked_new = and <8 x i32> %1, %mask
|
||||
%new = or <8 x i32> %cleared_old, %masked_new
|
||||
ret <8 x i32> %new
|
||||
}
|
||||
|
||||
define <8 x float> @__vselect_float(<8 x float>, <8 x float>,
|
||||
<8 x i32> %mask) nounwind readnone alwaysinline {
|
||||
%v0 = bitcast <8 x float> %0 to <8 x i32>
|
||||
%v1 = bitcast <8 x float> %1 to <8 x i32>
|
||||
%r = call <8 x i32> @__vselect_i32(<8 x i32> %v0, <8 x i32> %v1, <8 x i32> %mask)
|
||||
%rf = bitcast <8 x i32> %r to <8 x float>
|
||||
ret <8 x float> %rf
|
||||
}
|
||||
|
||||
|
||||
; To do vector integer min and max, we do the vector compare and then sign
|
||||
; extend the i1 vector result to an i32 mask. The __vselect does the
|
||||
; rest...
|
||||
|
||||
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp slt <8 x i32> %0, %1
|
||||
%mask = sext <8 x i1> %c to <8 x i32>
|
||||
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
|
||||
ret <8 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp slt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp sgt <8 x i32> %0, %1
|
||||
%mask = sext <8 x i1> %c to <8 x i32>
|
||||
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
|
||||
ret <8 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp sgt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; The functions for unsigned ints are similar, just with unsigned
|
||||
; comparison functions...
|
||||
|
||||
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp ult <8 x i32> %0, %1
|
||||
%mask = sext <8 x i1> %c to <8 x i32>
|
||||
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
|
||||
ret <8 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp ult i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp ugt <8 x i32> %0, %1
|
||||
%mask = sext <8 x i1> %c to <8 x i32>
|
||||
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
|
||||
ret <8 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp ugt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; and shift the first one over by 4 before ORing it with the value
|
||||
; of the second one
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
define <4 x float> @__vec4_add_float(<4 x float> %v0,
|
||||
<4 x float> %v1) nounwind readnone alwaysinline {
|
||||
%v = fadd <4 x float> %v0, %v1
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
define float @__add_float(float, float) nounwind readnone alwaysinline {
|
||||
%v = fadd float %0, %1
|
||||
ret float %v
|
||||
}
|
||||
|
||||
define float @__reduce_add_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8by4(float, @__vec4_add_float, @__add_float)
|
||||
}
|
||||
|
||||
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
; helper function for reduce_add_int32
|
||||
define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
|
||||
<4 x i32> %v1) nounwind readnone alwaysinline {
|
||||
%v = add <4 x i32> %v0, %v1
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
; helper function for reduce_add_int32
|
||||
define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
%v = add i32 %0, %1
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @__vec4_add_int32, @__add_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
define <4 x double> @__add_varying_double(<4 x double>,
|
||||
<4 x double>) nounwind readnone alwaysinline {
|
||||
%r = fadd <4 x double> %0, %1
|
||||
ret <4 x double> %r
|
||||
}
|
||||
|
||||
define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
%r = fadd double %0, %1
|
||||
ret double %r
|
||||
}
|
||||
|
||||
define double @__reduce_add_double(<8 x double>) nounwind readnone {
|
||||
reduce8by4(double, @__add_varying_double, @__add_uniform_double)
|
||||
}
|
||||
|
||||
define double @__reduce_min_double(<8 x double>) nounwind readnone {
|
||||
reduce8(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
define double @__reduce_max_double(<8 x double>) nounwind readnone {
|
||||
reduce8(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
define <4 x i64> @__add_varying_int64(<4 x i64>,
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
%r = add <4 x i64> %0, %1
|
||||
ret <4 x i64> %r
|
||||
}
|
||||
|
||||
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
%r = add i64 %0, %1
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
reduce_equal(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(8, i8, 8)
|
||||
load_and_broadcast(8, i16, 16)
|
||||
load_and_broadcast(8, i32, 32)
|
||||
load_and_broadcast(8, i64, 64)
|
||||
|
||||
load_masked(8, i8, 8, 1)
|
||||
load_masked(8, i16, 16, 2)
|
||||
load_masked(8, i32, 32, 4)
|
||||
load_masked(8, i64, 64, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
|
||||
gen_scatter(8, i8)
|
||||
gen_scatter(8, i16)
|
||||
gen_scatter(8, i32)
|
||||
gen_scatter(8, i64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float rounding
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
;; There are not any rounding instructions in SSE2, so we have to emulate
|
||||
;; the functionality with multiple instructions...
|
||||
|
||||
; The code for __round_* is the result of compiling the following source
|
||||
; code.
|
||||
;
|
||||
; export float Round(float x) {
|
||||
; unsigned int sign = signbits(x);
|
||||
; unsigned int ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; x += 0x1.0p23f;
|
||||
; x -= 0x1.0p23f;
|
||||
; ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; return x;
|
||||
;}
|
||||
|
||||
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32>
|
||||
%bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
|
||||
%bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float>
|
||||
%binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
|
||||
%binop21.i = fadd <8 x float> %binop.i, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
|
||||
%float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32>
|
||||
%bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float>
|
||||
ret <8 x float> %int_to_float_bitcast.i.i.i
|
||||
}
|
||||
|
||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||
;; bitcode from compiling the following source code...
|
||||
|
||||
;export float Floor(float x) {
|
||||
; float y = Round(x);
|
||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
||||
; float delta = -1.f;
|
||||
; unsigned int idelta = intbits(delta);
|
||||
; idelta &= cmp;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
|
||||
%bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
|
||||
%bitop.i = and <8 x i32> %val_to_boolvec32.i, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
|
||||
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
|
||||
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret <8 x float> %binop.i
|
||||
}
|
||||
|
||||
;; And here is the code we compiled to get the __ceil* functions below
|
||||
;
|
||||
;export uniform float Ceil(uniform float x) {
|
||||
; uniform float y = Round(x);
|
||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
||||
; uniform float delta = 1.f;
|
||||
; uniform int idelta = intbits(delta);
|
||||
; idelta &= yltx;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
|
||||
%bincmp.i = fcmp olt <8 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
|
||||
%bitop.i = and <8 x i32> %val_to_boolvec32.i, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
|
||||
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
|
||||
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret <8 x float> %binop.i
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
unary1to8(double, @round)
|
||||
}
|
||||
|
||||
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
unary1to8(double, @floor)
|
||||
}
|
||||
|
||||
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
unary1to8(double, @ceil)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(8, i8, 8)
|
||||
gen_masked_store(8, i16, 16)
|
||||
gen_masked_store(8, i32, 32)
|
||||
gen_masked_store(8, i64, 64)
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <8 x i32> * %0, align 4
|
||||
%newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask)
|
||||
store <8 x i32> %newval, <8 x i32> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
%oldValue = load <8 x i64>* %ptr, align 8
|
||||
|
||||
; Do 8x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
|
||||
; are actually bitcast <2 x i64> values
|
||||
;
|
||||
; set up the first two 64-bit values
|
||||
%old0123 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%old0123f = bitcast <4 x i64> %old0123 to <8 x float>
|
||||
%new0123 = shufflevector <8 x i64> %new, <8 x i64> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%new0123f = bitcast <4 x i64> %new0123 to <8 x float>
|
||||
; compute mask--note that the indices are doubled-up
|
||||
%mask0123 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
; and blend the first 4 values
|
||||
%result0123f = call <8 x float> @__vselect_float(<8 x float> %old0123f, <8 x float> %new0123f,
|
||||
<8 x i32> %mask0123)
|
||||
%result0123 = bitcast <8 x float> %result0123f to <4 x i64>
|
||||
|
||||
; and again
|
||||
%old4567 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%old4567f = bitcast <4 x i64> %old4567 to <8 x float>
|
||||
%new4567 = shufflevector <8 x i64> %new, <8 x i64> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%new4567f = bitcast <4 x i64> %new4567 to <8 x float>
|
||||
; compute mask--note that the values are doubled-up
|
||||
%mask4567 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
||||
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
|
||||
; and blend the two of the values
|
||||
%result4567f = call <8 x float> @__vselect_float(<8 x float> %old4567f, <8 x float> %new4567f,
|
||||
<8 x i32> %mask4567)
|
||||
%result4567 = bitcast <8 x float> %result4567f to <4 x i64>
|
||||
|
||||
; reconstruct the final <8 x i64> vector
|
||||
%final = shufflevector <4 x i64> %result0123, <4 x i64> %result4567,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
store <8 x i64> %final, <8 x i64> * %ptr, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
|
||||
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision float min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
380
builtins-sse2.ll
380
builtins-sse2.ll
@@ -36,9 +36,9 @@
|
||||
stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
scans(4)
|
||||
int64minmax(4)
|
||||
|
||||
include(`builtins-sse2-common.ll')
|
||||
; Include the various definitions of things that only require SSE1 and SSE2
|
||||
include(`builtins-sse.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
@@ -62,7 +62,7 @@ include(`builtins-sse2-common.ll')
|
||||
; return x;
|
||||
;}
|
||||
|
||||
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32>
|
||||
%bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
|
||||
%bitop.i = xor <4 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
|
||||
@@ -75,6 +75,19 @@ define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysi
|
||||
ret <4 x float> %int_to_float_bitcast.i.i.i
|
||||
}
|
||||
|
||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
%float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
|
||||
%bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
|
||||
%bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
|
||||
%int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
|
||||
%binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
|
||||
%binop21.i = fadd float %binop.i, -8.388608e+06
|
||||
%float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
|
||||
%bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
|
||||
ret float %int_to_float_bitcast.i.i.i
|
||||
}
|
||||
|
||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||
;; bitcode from compiling the following source code...
|
||||
|
||||
@@ -88,7 +101,7 @@ define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysi
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
|
||||
%bincmp.i = fcmp ogt <4 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
|
||||
@@ -98,6 +111,16 @@ define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysi
|
||||
ret <4 x float> %binop.i
|
||||
}
|
||||
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
|
||||
%bincmp.i = fcmp ogt float %calltmp.i, %0
|
||||
%selectexpr.i = sext i1 %bincmp.i to i32
|
||||
%bitop.i = and i32 %selectexpr.i, -1082130432
|
||||
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
|
||||
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret float %binop.i
|
||||
}
|
||||
|
||||
;; And here is the code we compiled to get the __ceil* functions below
|
||||
;
|
||||
;export uniform float Ceil(uniform float x) {
|
||||
@@ -110,7 +133,7 @@ define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysi
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
|
||||
%bincmp.i = fcmp olt <4 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
|
||||
@@ -120,21 +143,50 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin
|
||||
ret <4 x float> %binop.i
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
|
||||
%bincmp.i = fcmp olt float %calltmp.i, %0
|
||||
%selectexpr.i = sext i1 %bincmp.i to i32
|
||||
%bitop.i = and i32 %selectexpr.i, 1065353216
|
||||
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
|
||||
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret float %binop.i
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
declare double @round(double)
|
||||
declare double @floor(double)
|
||||
declare double @ceil(double)
|
||||
|
||||
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @round)
|
||||
}
|
||||
|
||||
define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%r = call double @round(double %0)
|
||||
ret double %r
|
||||
}
|
||||
|
||||
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @floor)
|
||||
}
|
||||
|
||||
define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%r = call double @floor(double %0)
|
||||
ret double %r
|
||||
}
|
||||
|
||||
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @ceil)
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%r = call double @ceil(double %0)
|
||||
ret double %r
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; min/max
|
||||
|
||||
@@ -143,7 +195,7 @@ define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alway
|
||||
; vector element, if the mask is on, we return the corresponding value
|
||||
; from %1, and otherwise return the value from %0.
|
||||
|
||||
define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
|
||||
define internal <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
|
||||
<4 x i32> %mask) nounwind readnone alwaysinline {
|
||||
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
|
||||
%cleared_old = and <4 x i32> %0, %notmask
|
||||
@@ -152,7 +204,7 @@ define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
|
||||
ret <4 x i32> %new
|
||||
}
|
||||
|
||||
define <4 x float> @__vselect_float(<4 x float>, <4 x float>,
|
||||
define internal <4 x float> @__vselect_float(<4 x float>, <4 x float>,
|
||||
<4 x i32> %mask) nounwind readnone alwaysinline {
|
||||
%v0 = bitcast <4 x float> %0 to <4 x i32>
|
||||
%v1 = bitcast <4 x float> %1 to <4 x i32>
|
||||
@@ -166,27 +218,27 @@ define <4 x float> @__vselect_float(<4 x float>, <4 x float>,
|
||||
; extend the i1 vector result to an i32 mask. The __vselect does the
|
||||
; rest...
|
||||
|
||||
define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp slt <4 x i32> %0, %1
|
||||
%mask = sext <4 x i1> %c to <4 x i32>
|
||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp slt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp sgt <4 x i32> %0, %1
|
||||
%mask = sext <4 x i1> %c to <4 x i32>
|
||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp sgt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
@@ -195,27 +247,27 @@ define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
; The functions for unsigned ints are similar, just with unsigned
|
||||
; comparison functions...
|
||||
|
||||
define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp ult <4 x i32> %0, %1
|
||||
%mask = sext <4 x i1> %c to <4 x i32>
|
||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp ult i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp ugt <4 x i32> %0, %1
|
||||
%mask = sext <4 x i1> %c to <4 x i32>
|
||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp ugt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
@@ -225,15 +277,21 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
declare i32 @llvm.ctpop.i32(i32)
|
||||
declare i64 @llvm.ctpop.i64(i64)
|
||||
|
||||
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%val = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %val
|
||||
}
|
||||
|
||||
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
|
||||
%val = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %val
|
||||
}
|
||||
|
||||
|
||||
define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
|
||||
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||
%m1 = fadd <4 x float> %v1, %v
|
||||
@@ -243,96 +301,6 @@ define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline
|
||||
ret float %sum
|
||||
}
|
||||
|
||||
define float @__reduce_min_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
define float @__reduce_max_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
|
||||
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
|
||||
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||
%m1 = add <4 x i32> %v1, %v
|
||||
%m1a = extractelement <4 x i32> %m1, i32 0
|
||||
%m1b = extractelement <4 x i32> %m1, i32 1
|
||||
%sum = add i32 %m1a, %m1b
|
||||
ret i32 %sum
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
|
||||
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
define double @__reduce_add_double(<4 x double>) nounwind readnone {
|
||||
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
|
||||
<2 x i32> <i32 0, i32 1>
|
||||
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
|
||||
<2 x i32> <i32 2, i32 3>
|
||||
%sum = fadd <2 x double> %v0, %v1
|
||||
%e0 = extractelement <2 x double> %sum, i32 0
|
||||
%e1 = extractelement <2 x double> %sum, i32 1
|
||||
%m = fadd double %e0, %e1
|
||||
ret double %m
|
||||
}
|
||||
|
||||
define double @__reduce_min_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
define double @__reduce_max_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
|
||||
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
||||
<2 x i32> <i32 0, i32 1>
|
||||
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
||||
<2 x i32> <i32 2, i32 3>
|
||||
%sum = add <2 x i64> %v0, %v1
|
||||
%e0 = extractelement <2 x i64> %sum, i32 0
|
||||
%e1 = extractelement <2 x i64> %sum, i32 1
|
||||
%m = add i64 %e0, %e1
|
||||
ret i64 %m
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
reduce_equal(4)
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
@@ -387,187 +355,3 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
|
||||
; do one N-R iteration to improve precision
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
%v_iv = fmul <4 x float> %0, %call
|
||||
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
|
||||
%iv_mul = fmul <4 x float> %call, %two_minus
|
||||
ret <4 x float> %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
|
||||
; Newton-Raphson iteration to improve precision
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul <4 x float> %v, %is
|
||||
%v_is_is = fmul <4 x float> %v_is, %is
|
||||
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%is_mul = fmul <4 x float> %is, %three_sub
|
||||
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
ret <4 x float> %half_scale
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
|
||||
define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
|
||||
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
|
||||
store <4 x float> %s, <4 x float> * %1
|
||||
ret void
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
|
||||
define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
|
||||
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
masked_store_blend_8_16_by_4()
|
||||
|
||||
gen_masked_store(4, i8, 8)
|
||||
gen_masked_store(4, i16, 16)
|
||||
gen_masked_store(4, i32, 32)
|
||||
gen_masked_store(4, i64, 64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(4, i8, 8)
|
||||
load_and_broadcast(4, i16, 16)
|
||||
load_and_broadcast(4, i32, 32)
|
||||
load_and_broadcast(4, i64, 64)
|
||||
|
||||
load_masked(4, i8, 8, 1)
|
||||
load_masked(4, i16, 16, 2)
|
||||
load_masked(4, i32, 32, 4)
|
||||
load_masked(4, i64, 64, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather(4, i8)
|
||||
gen_gather(4, i16)
|
||||
gen_gather(4, i32)
|
||||
gen_gather(4, i64)
|
||||
|
||||
gen_scatter(4, i8)
|
||||
gen_scatter(4, i16)
|
||||
gen_scatter(4, i32)
|
||||
gen_scatter(4, i64)
|
||||
|
||||
@@ -1,271 +0,0 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
; the roundss intrinsic is a total mess--docs say:
|
||||
;
|
||||
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
||||
;
|
||||
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
||||
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
||||
; return value is described by the following equations:
|
||||
;
|
||||
; r0 = RND(b0)
|
||||
; r1 = a1
|
||||
; r2 = a2
|
||||
; r3 = a3
|
||||
;
|
||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||
; here. So we pass the same register for both. Further, only the 0th
|
||||
; element of the b parameter matters
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1010 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||
|
||||
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; do the rcpss call
|
||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
||||
%scall = extractelement <4 x float> %call, i32 0
|
||||
|
||||
; do one N-R iteration to improve precision, as above
|
||||
%v_iv = fmul float %0, %scall
|
||||
%two_minus = fsub float 2., %v_iv
|
||||
%iv_mul = fmul float %scall, %two_minus
|
||||
ret float %iv_mul
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||
%is = extractelement <4 x float> %vis, i32 0
|
||||
|
||||
; Newton-Raphson iteration to improve precision
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul float %0, %is
|
||||
%v_is_is = fmul float %v_is, %is
|
||||
%three_sub = fsub float 3., %v_is_is
|
||||
%is_mul = fmul float %is, %three_sub
|
||||
%half_scale = fmul float 0.5, %is_mul
|
||||
ret float %half_scale
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fast math mode
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define double @__min_uniform_double(double, double) nounwind readnone {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
define double @__max_uniform_double(double, double) nounwind readnone {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int32 min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; unsigned int min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
402
builtins-sse4.ll
402
builtins-sse4.ll
@@ -36,334 +36,200 @@
|
||||
stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
scans(4)
|
||||
int64minmax(4)
|
||||
|
||||
include(`builtins-sse4-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
|
||||
; do one N-R iteration to improve precision
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
%v_iv = fmul <4 x float> %0, %call
|
||||
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
|
||||
%iv_mul = fmul <4 x float> %call, %two_minus
|
||||
ret <4 x float> %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
|
||||
; Newton-Raphson iteration to improve precision
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul <4 x float> %v, %is
|
||||
%v_is_is = fmul <4 x float> %v_is, %is
|
||||
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%is_mul = fmul <4 x float> %is, %three_sub
|
||||
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
ret <4 x float> %half_scale
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
|
||||
define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
|
||||
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
; Define the stuff that can be done with base SSE1/SSE2 instructions
|
||||
include(`builtins-sse.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
; the roundss intrinsic is a total mess--docs say:
|
||||
;
|
||||
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
||||
;
|
||||
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
||||
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
||||
; return value is described by the following equations:
|
||||
;
|
||||
; r0 = RND(b0)
|
||||
; r1 = a1
|
||||
; r2 = a2
|
||||
; r3 = a3
|
||||
;
|
||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||
; here. So we pass the same register for both. Further, only the 0th
|
||||
; element of the b parameter matters
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1010 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||
|
||||
define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
round2to4double(%0, 8)
|
||||
}
|
||||
|
||||
define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round2to4double(%0, 9)
|
||||
}
|
||||
|
||||
define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round2to4double(%0, 10)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %call
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int32 min/max
|
||||
|
||||
define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
|
||||
ret <4 x i32> %call
|
||||
}
|
||||
|
||||
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
|
||||
ret <4 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; unsigned int min/max
|
||||
|
||||
define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
|
||||
ret <4 x i32> %call
|
||||
}
|
||||
|
||||
define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
|
||||
ret <4 x i32> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
|
||||
define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
|
||||
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
|
||||
store <4 x float> %s, <4 x float> * %1
|
||||
ret void
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
|
||||
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
ret i32 %v
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
|
||||
%v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
|
||||
%scalar = extractelement <4 x float> %v2, i32 0
|
||||
ret float %scalar
|
||||
}
|
||||
|
||||
define float @__reduce_min_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
define float @__reduce_max_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
|
||||
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
|
||||
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||
%m1 = add <4 x i32> %v1, %v
|
||||
%m1a = extractelement <4 x i32> %m1, i32 0
|
||||
%m1b = extractelement <4 x i32> %m1, i32 1
|
||||
%sum = add i32 %m1a, %m1b
|
||||
ret i32 %sum
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
|
||||
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
define double @__reduce_add_double(<4 x double>) nounwind readnone {
|
||||
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
|
||||
<2 x i32> <i32 0, i32 1>
|
||||
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
|
||||
<2 x i32> <i32 2, i32 3>
|
||||
%sum = fadd <2 x double> %v0, %v1
|
||||
%e0 = extractelement <2 x double> %sum, i32 0
|
||||
%e1 = extractelement <2 x double> %sum, i32 1
|
||||
%m = fadd double %e0, %e1
|
||||
ret double %m
|
||||
}
|
||||
|
||||
define double @__reduce_min_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
define double @__reduce_max_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
|
||||
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
||||
<2 x i32> <i32 0, i32 1>
|
||||
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
||||
<2 x i32> <i32 2, i32 3>
|
||||
%sum = add <2 x i64> %v0, %v1
|
||||
%e0 = extractelement <2 x i64> %sum, i32 0
|
||||
%e1 = extractelement <2 x i64> %sum, i32 1
|
||||
%m = add i64 %e0, %e1
|
||||
ret i64 %m
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
reduce_equal(4)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
@@ -432,41 +298,3 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
store <4 x i64> %final, <4 x i64> * %ptr, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
masked_store_blend_8_16_by_4()
|
||||
|
||||
gen_masked_store(4, i8, 8)
|
||||
gen_masked_store(4, i16, 16)
|
||||
gen_masked_store(4, i32, 32)
|
||||
gen_masked_store(4, i64, 64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(4, i8, 8)
|
||||
load_and_broadcast(4, i16, 16)
|
||||
load_and_broadcast(4, i32, 32)
|
||||
load_and_broadcast(4, i64, 64)
|
||||
|
||||
load_masked(4, i8, 8, 1)
|
||||
load_masked(4, i16, 16, 2)
|
||||
load_masked(4, i32, 32, 4)
|
||||
load_masked(4, i64, 64, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather(4, i8)
|
||||
gen_gather(4, i16)
|
||||
gen_gather(4, i32)
|
||||
gen_gather(4, i64)
|
||||
|
||||
gen_scatter(4, i8)
|
||||
gen_scatter(4, i16)
|
||||
gen_scatter(4, i32)
|
||||
gen_scatter(4, i64)
|
||||
|
||||
@@ -41,14 +41,13 @@ packed_load_and_store(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
|
||||
include(`builtins-sse4-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
|
||||
@@ -61,12 +60,27 @@ define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinl
|
||||
ret <8 x float> %iv_mul
|
||||
}
|
||||
|
||||
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float iv = extract(__rcp_u(v), 0);
|
||||
; return iv * (2. - v * iv);
|
||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
||||
%scall = extractelement <4 x float> %call, i32 0
|
||||
|
||||
; do one N-R iteration
|
||||
%v_iv = fmul float %0, %scall
|
||||
%two_minus = fsub float 2., %v_iv
|
||||
%iv_mul = fmul float %scall, %two_minus
|
||||
ret float %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||
define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
@@ -80,16 +94,56 @@ define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwa
|
||||
ret <8 x float> %half_scale
|
||||
}
|
||||
|
||||
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||
%is = extractelement <4 x float> %vis, i32 0
|
||||
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul float %0, %is
|
||||
%v_is_is = fmul float %v_is, %is
|
||||
%three_sub = fsub float 3., %v_is_is
|
||||
%is_mul = fmul float %is, %three_sub
|
||||
%half_scale = fmul float 0.5, %is_mul
|
||||
ret float %half_scale
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fast math
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define internal void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
@@ -104,17 +158,17 @@ declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
|
||||
define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
|
||||
define internal <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_sinf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
|
||||
define internal <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_cosf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define void @__svml_sincos(<8 x float>, <8 x float> *,
|
||||
define internal void @__svml_sincos(<8 x float>, <8 x float> *,
|
||||
<8 x float> *) nounwind readnone alwaysinline {
|
||||
; call svml_sincosf4 two times with the two 4-wide sub-vectors
|
||||
%a = shufflevector <8 x float> %0, <8 x float> undef,
|
||||
@@ -143,33 +197,33 @@ define void @__svml_sincos(<8 x float>, <8 x float> *,
|
||||
ret void
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
|
||||
define internal <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_tanf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
|
||||
define internal <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_atanf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_atan2(<8 x float>,
|
||||
define internal <8 x float> @__svml_atan2(<8 x float>,
|
||||
<8 x float>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
|
||||
define internal <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_expf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
|
||||
define internal <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_logf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define <8 x float> @__svml_pow(<8 x float>,
|
||||
define internal <8 x float> @__svml_pow(<8 x float>,
|
||||
<8 x float>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, float, @__svml_powf4, %0, %1)
|
||||
ret <8 x float> %ret
|
||||
@@ -180,52 +234,91 @@ define <8 x float> @__svml_pow(<8 x float>,
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
define internal <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int32 min/max
|
||||
|
||||
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; unsigned int min/max
|
||||
|
||||
define <8 x i32> @__min_varying_uint32(<8 x i32>,
|
||||
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_uint32(<8 x i32>,
|
||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
@@ -242,103 +335,103 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
|
||||
}
|
||||
|
||||
define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8by4(float, @llvm.x86.sse.max.ps, @__max_uniform_float)
|
||||
}
|
||||
|
||||
; helper function for reduce_add_int32
|
||||
define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
|
||||
define internal <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
|
||||
<4 x i32> %v1) nounwind readnone alwaysinline {
|
||||
%v = add <4 x i32> %v0, %v1
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
; helper function for reduce_add_int32
|
||||
define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
define internal i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
%v = add i32 %0, %1
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @__vec4_add_int32, @__add_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @llvm.x86.sse41.pminsd, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
define <4 x double> @__add_varying_double(<4 x double>,
|
||||
define internal <4 x double> @__add_varying_double(<4 x double>,
|
||||
<4 x double>) nounwind readnone alwaysinline {
|
||||
%r = fadd <4 x double> %0, %1
|
||||
ret <4 x double> %r
|
||||
}
|
||||
|
||||
define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
%r = fadd double %0, %1
|
||||
ret double %r
|
||||
}
|
||||
|
||||
define double @__reduce_add_double(<8 x double>) nounwind readnone {
|
||||
define internal double @__reduce_add_double(<8 x double>) nounwind readnone {
|
||||
reduce8by4(double, @__add_varying_double, @__add_uniform_double)
|
||||
}
|
||||
|
||||
define double @__reduce_min_double(<8 x double>) nounwind readnone {
|
||||
define internal double @__reduce_min_double(<8 x double>) nounwind readnone {
|
||||
reduce8(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
define double @__reduce_max_double(<8 x double>) nounwind readnone {
|
||||
define internal double @__reduce_max_double(<8 x double>) nounwind readnone {
|
||||
reduce8(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
define <4 x i64> @__add_varying_int64(<4 x i64>,
|
||||
define internal <4 x i64> @__add_varying_int64(<4 x i64>,
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
%r = add <4 x i64> %0, %1
|
||||
ret <4 x i64> %r
|
||||
}
|
||||
|
||||
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
%r = add i64 %0, %1
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
|
||||
define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
|
||||
define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
|
||||
define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
|
||||
define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
|
||||
define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
@@ -374,47 +467,129 @@ gen_scatter(8, i64)
|
||||
;; float rounding
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
round4to8(%0, 8)
|
||||
}
|
||||
|
||||
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
; the roundss intrinsic is a total mess--docs say:
|
||||
;
|
||||
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
||||
;
|
||||
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
||||
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
||||
; return value is described by the following equations:
|
||||
;
|
||||
; r0 = RND(b0)
|
||||
; r1 = a1
|
||||
; r2 = a2
|
||||
; r3 = a3
|
||||
;
|
||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||
; here. So we pass the same register for both.
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round4to8(%0, 9)
|
||||
}
|
||||
|
||||
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round4to8(%0, 10)
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||
|
||||
define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
round2to8double(%0, 8)
|
||||
}
|
||||
|
||||
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round2to8double(%0, 9)
|
||||
}
|
||||
|
||||
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round2to8double(%0, 10)
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%a = shufflevector <8 x float> %0, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%b = shufflevector <8 x float> %0, <8 x float> undef,
|
||||
@@ -543,24 +718,44 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
|
||||
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision float min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret double %ret
|
||||
|
||||
}
|
||||
432
builtins.cpp
432
builtins.cpp
@@ -114,39 +114,59 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
|
||||
// pointers to uniform
|
||||
else if (t == LLVMTypes::Int8PointerType)
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt8 :
|
||||
AtomicType::UniformInt8);
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt8 :
|
||||
AtomicType::UniformInt8, false);
|
||||
else if (t == LLVMTypes::Int16PointerType)
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt16 :
|
||||
AtomicType::UniformInt16);
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt16 :
|
||||
AtomicType::UniformInt16, false);
|
||||
else if (t == LLVMTypes::Int32PointerType)
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt32 :
|
||||
AtomicType::UniformInt32);
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 :
|
||||
AtomicType::UniformInt32, false);
|
||||
else if (t == LLVMTypes::Int64PointerType)
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt64 :
|
||||
AtomicType::UniformInt64);
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt64 :
|
||||
AtomicType::UniformInt64, false);
|
||||
else if (t == LLVMTypes::FloatPointerType)
|
||||
return PointerType::GetUniform(AtomicType::UniformFloat);
|
||||
return new ReferenceType(AtomicType::UniformFloat, false);
|
||||
else if (t == LLVMTypes::DoublePointerType)
|
||||
return PointerType::GetUniform(AtomicType::UniformDouble);
|
||||
return new ReferenceType(AtomicType::UniformDouble, false);
|
||||
|
||||
// pointers to varying
|
||||
else if (t == LLVMTypes::Int8VectorPointerType)
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt8 :
|
||||
AtomicType::VaryingInt8);
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt8 :
|
||||
AtomicType::VaryingInt8, false);
|
||||
else if (t == LLVMTypes::Int16VectorPointerType)
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt16 :
|
||||
AtomicType::VaryingInt16);
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt16 :
|
||||
AtomicType::VaryingInt16, false);
|
||||
else if (t == LLVMTypes::Int32VectorPointerType)
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt32 :
|
||||
AtomicType::VaryingInt32);
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
|
||||
AtomicType::VaryingInt32, false);
|
||||
else if (t == LLVMTypes::Int64VectorPointerType)
|
||||
return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt64 :
|
||||
AtomicType::VaryingInt64);
|
||||
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt64 :
|
||||
AtomicType::VaryingInt64, false);
|
||||
else if (t == LLVMTypes::FloatVectorPointerType)
|
||||
return PointerType::GetUniform(AtomicType::VaryingFloat);
|
||||
return new ReferenceType(AtomicType::VaryingFloat, false);
|
||||
else if (t == LLVMTypes::DoubleVectorPointerType)
|
||||
return PointerType::GetUniform(AtomicType::VaryingDouble);
|
||||
return new ReferenceType(AtomicType::VaryingDouble, false);
|
||||
|
||||
// arrays
|
||||
else if (llvm::isa<const llvm::PointerType>(t)) {
|
||||
const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);
|
||||
|
||||
// Is it a pointer to an unsized array of objects? If so, then
|
||||
// create the equivalent ispc type. Note that it has to be a
|
||||
// reference to an array, since ispc passes arrays to functions by
|
||||
// reference.
|
||||
const llvm::ArrayType *at =
|
||||
llvm::dyn_cast<const llvm::ArrayType>(pt->getElementType());
|
||||
if (at != NULL) {
|
||||
const Type *eltType = lLLVMTypeToISPCType(at->getElementType(),
|
||||
intAsUnsigned);
|
||||
if (eltType == NULL)
|
||||
return NULL;
|
||||
return new ReferenceType(new ArrayType(eltType, at->getNumElements()),
|
||||
false);
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
@@ -161,9 +181,11 @@ lCreateSymbol(const std::string &name, const Type *returnType,
|
||||
noPos.name = "__stdlib";
|
||||
|
||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||
|
||||
Debug(noPos, "Created builtin symbol \"%s\" [%s]\n", name.c_str(),
|
||||
funcType->GetString().c_str());
|
||||
// set NULL default arguments
|
||||
std::vector<ConstExpr *> defaults;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j)
|
||||
defaults.push_back(NULL);
|
||||
funcType->SetArgumentDefaults(defaults);
|
||||
|
||||
Symbol *sym = new Symbol(name, noPos, funcType);
|
||||
sym->function = func;
|
||||
@@ -186,9 +208,6 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
if (name.size() < 3 || name[0] != '_' || name[1] != '_')
|
||||
return false;
|
||||
|
||||
Debug(SourcePos(), "Attempting to create ispc symbol for function \"%s\".",
|
||||
name.c_str());
|
||||
|
||||
// An unfortunate hack: we want this builtin function to have the
|
||||
// signature "int __sext_varying_bool(bool)", but the ispc function
|
||||
// symbol creation code below assumes that any LLVM vector of i32s is a
|
||||
@@ -198,8 +217,11 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
const Type *returnType = AtomicType::VaryingInt32;
|
||||
std::vector<const Type *> argTypes;
|
||||
argTypes.push_back(AtomicType::VaryingBool);
|
||||
std::vector<ConstExpr *> defaults;
|
||||
defaults.push_back(NULL);
|
||||
|
||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||
funcType->SetArgumentDefaults(defaults);
|
||||
|
||||
Symbol *sym = new Symbol(name, noPos, funcType);
|
||||
sym->function = func;
|
||||
@@ -216,27 +238,22 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
|
||||
const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType(),
|
||||
intAsUnsigned);
|
||||
if (returnType == NULL) {
|
||||
Debug(SourcePos(), "Failed: return type not representable for "
|
||||
"builtin %s.", name.c_str());
|
||||
if (!returnType)
|
||||
// return type not representable in ispc -> not callable from ispc
|
||||
return false;
|
||||
}
|
||||
|
||||
// Iterate over the arguments and try to find their equivalent ispc
|
||||
// types. Track if any of the arguments has an integer type.
|
||||
bool anyIntArgs = false;
|
||||
bool anyIntArgs = false, anyReferenceArgs = false;
|
||||
std::vector<const Type *> argTypes;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
|
||||
const llvm::Type *llvmArgType = ftype->getParamType(j);
|
||||
const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
|
||||
if (type == NULL) {
|
||||
Debug(SourcePos(), "Failed: type of parameter %d not "
|
||||
"representable for builtin %s", j, name.c_str());
|
||||
if (type == NULL)
|
||||
return false;
|
||||
}
|
||||
anyIntArgs |=
|
||||
(Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
|
||||
anyReferenceArgs |= (dynamic_cast<const ReferenceType *>(type) != NULL);
|
||||
argTypes.push_back(type);
|
||||
}
|
||||
|
||||
@@ -244,6 +261,19 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
// so that we get symbols for things with no integer types!
|
||||
if (i == 0 || anyIntArgs == true)
|
||||
lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
|
||||
|
||||
// If there are any reference types, also make a variant of the
|
||||
// symbol that has them as const references. This obviously
|
||||
// doesn't make sense for many builtins, but we'll give the stdlib
|
||||
// the option to call one if it needs one.
|
||||
if (anyReferenceArgs == true) {
|
||||
for (unsigned int j = 0; j < argTypes.size(); ++j) {
|
||||
if (dynamic_cast<const ReferenceType *>(argTypes[j]) != NULL)
|
||||
argTypes[j] = argTypes[j]->GetAsConstType();
|
||||
lCreateSymbol(name + "_refsconst", returnType, argTypes,
|
||||
ftype, func, symbolTable);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -297,263 +327,6 @@ lCheckModuleIntrinsics(llvm::Module *module) {
|
||||
}
|
||||
|
||||
|
||||
/** We'd like to have all of these functions declared as 'internal' in
|
||||
their respective bitcode files so that if they aren't needed by the
|
||||
user's program they are elimiated from the final output. However, if
|
||||
we do so, then they aren't brought in by the LinkModules() call below
|
||||
since they aren't yet used by anything in the module they're being
|
||||
linked with (in LLVM 3.1, at least).
|
||||
|
||||
Therefore, we don't declare them as internal when we first define them,
|
||||
but instead mark them as internal after they've been linked in. This
|
||||
is admittedly a kludge.
|
||||
*/
|
||||
static void
|
||||
lSetInternalFunctions(llvm::Module *module) {
|
||||
const char *names[] = {
|
||||
"__add_uniform_int32",
|
||||
"__add_uniform_int64",
|
||||
"__add_varying_int32",
|
||||
"__add_varying_int64",
|
||||
"__aos_to_soa3_float",
|
||||
"__aos_to_soa3_float16",
|
||||
"__aos_to_soa3_float4",
|
||||
"__aos_to_soa3_float8",
|
||||
"__aos_to_soa3_int32",
|
||||
"__aos_to_soa4_float",
|
||||
"__aos_to_soa4_float16",
|
||||
"__aos_to_soa4_float4",
|
||||
"__aos_to_soa4_float8",
|
||||
"__aos_to_soa4_int32",
|
||||
"__atomic_add_int32_global",
|
||||
"__atomic_add_int64_global",
|
||||
"__atomic_add_uniform_int32_global",
|
||||
"__atomic_add_uniform_int64_global",
|
||||
"__atomic_and_int32_global",
|
||||
"__atomic_and_int64_global",
|
||||
"__atomic_and_uniform_int32_global",
|
||||
"__atomic_and_uniform_int64_global",
|
||||
"__atomic_compare_exchange_double_global",
|
||||
"__atomic_compare_exchange_float_global",
|
||||
"__atomic_compare_exchange_int32_global",
|
||||
"__atomic_compare_exchange_int64_global",
|
||||
"__atomic_compare_exchange_uniform_double_global",
|
||||
"__atomic_compare_exchange_uniform_float_global",
|
||||
"__atomic_compare_exchange_uniform_int32_global",
|
||||
"__atomic_compare_exchange_uniform_int64_global",
|
||||
"__atomic_max_uniform_int32_global",
|
||||
"__atomic_max_uniform_int64_global",
|
||||
"__atomic_min_uniform_int32_global",
|
||||
"__atomic_min_uniform_int64_global",
|
||||
"__atomic_or_int32_global",
|
||||
"__atomic_or_int64_global",
|
||||
"__atomic_or_uniform_int32_global",
|
||||
"__atomic_or_uniform_int64_global",
|
||||
"__atomic_sub_int32_global",
|
||||
"__atomic_sub_int64_global",
|
||||
"__atomic_sub_uniform_int32_global",
|
||||
"__atomic_sub_uniform_int64_global",
|
||||
"__atomic_swap_double_global",
|
||||
"__atomic_swap_float_global",
|
||||
"__atomic_swap_int32_global",
|
||||
"__atomic_swap_int64_global",
|
||||
"__atomic_swap_uniform_double_global",
|
||||
"__atomic_swap_uniform_float_global",
|
||||
"__atomic_swap_uniform_int32_global",
|
||||
"__atomic_swap_uniform_int64_global",
|
||||
"__atomic_umax_uniform_uint32_global",
|
||||
"__atomic_umax_uniform_uint64_global",
|
||||
"__atomic_umin_uniform_uint32_global",
|
||||
"__atomic_umin_uniform_uint64_global",
|
||||
"__atomic_xor_int32_global",
|
||||
"__atomic_xor_int64_global",
|
||||
"__atomic_xor_uniform_int32_global",
|
||||
"__atomic_xor_uniform_int64_global",
|
||||
"__broadcast_double",
|
||||
"__broadcast_float",
|
||||
"__broadcast_int16",
|
||||
"__broadcast_int32",
|
||||
"__broadcast_int64",
|
||||
"__broadcast_int8",
|
||||
"__ceil_uniform_double",
|
||||
"__ceil_uniform_float",
|
||||
"__ceil_varying_double",
|
||||
"__ceil_varying_float",
|
||||
"__count_trailing_zeros_i32",
|
||||
"__count_trailing_zeros_i64",
|
||||
"__count_leading_zeros_i32",
|
||||
"__count_leading_zeros_i64",
|
||||
"__do_assert_uniform",
|
||||
"__do_assert_varying",
|
||||
"__do_print",
|
||||
"__doublebits_uniform_int64",
|
||||
"__doublebits_varying_int64",
|
||||
"__exclusive_scan_add_double",
|
||||
"__exclusive_scan_add_float",
|
||||
"__exclusive_scan_add_i32",
|
||||
"__exclusive_scan_add_i64",
|
||||
"__exclusive_scan_and_i32",
|
||||
"__exclusive_scan_and_i64",
|
||||
"__exclusive_scan_or_i32",
|
||||
"__exclusive_scan_or_i64",
|
||||
"__extract_int16",
|
||||
"__extract_int32",
|
||||
"__extract_int64",
|
||||
"__extract_int8",
|
||||
"__fastmath",
|
||||
"__floatbits_uniform_int32",
|
||||
"__floatbits_varying_int32",
|
||||
"__floor_uniform_double",
|
||||
"__floor_uniform_float",
|
||||
"__floor_varying_double",
|
||||
"__floor_varying_float",
|
||||
"__insert_int16",
|
||||
"__insert_int32",
|
||||
"__insert_int64",
|
||||
"__insert_int8",
|
||||
"__intbits_uniform_double",
|
||||
"__intbits_uniform_float",
|
||||
"__intbits_varying_double",
|
||||
"__intbits_varying_float",
|
||||
"__max_uniform_double",
|
||||
"__max_uniform_float",
|
||||
"__max_uniform_int32",
|
||||
"__max_uniform_int64",
|
||||
"__max_uniform_uint32",
|
||||
"__max_uniform_uint64",
|
||||
"__max_varying_double",
|
||||
"__max_varying_float",
|
||||
"__max_varying_int32",
|
||||
"__max_varying_int64",
|
||||
"__max_varying_uint32",
|
||||
"__max_varying_uint64",
|
||||
"__memory_barrier",
|
||||
"__min_uniform_double",
|
||||
"__min_uniform_float",
|
||||
"__min_uniform_int32",
|
||||
"__min_uniform_int64",
|
||||
"__min_uniform_uint32",
|
||||
"__min_uniform_uint64",
|
||||
"__min_varying_double",
|
||||
"__min_varying_float",
|
||||
"__min_varying_int32",
|
||||
"__min_varying_int64",
|
||||
"__min_varying_uint32",
|
||||
"__min_varying_uint64",
|
||||
"__movmsk",
|
||||
"__num_cores",
|
||||
"__packed_load_active",
|
||||
"__packed_store_active",
|
||||
"__popcnt_int32",
|
||||
"__popcnt_int64",
|
||||
"__prefetch_read_uniform_1",
|
||||
"__prefetch_read_uniform_2",
|
||||
"__prefetch_read_uniform_3",
|
||||
"__prefetch_read_uniform_nt",
|
||||
"__rcp_uniform_float",
|
||||
"__rcp_varying_float",
|
||||
"__reduce_add_double",
|
||||
"__reduce_add_float",
|
||||
"__reduce_add_int32",
|
||||
"__reduce_add_int64",
|
||||
"__reduce_add_uint32",
|
||||
"__reduce_add_uint64",
|
||||
"__reduce_equal_double",
|
||||
"__reduce_equal_float",
|
||||
"__reduce_equal_int32",
|
||||
"__reduce_equal_int64",
|
||||
"__reduce_max_double",
|
||||
"__reduce_max_float",
|
||||
"__reduce_max_int32",
|
||||
"__reduce_max_int64",
|
||||
"__reduce_max_uint32",
|
||||
"__reduce_max_uint64",
|
||||
"__reduce_min_double",
|
||||
"__reduce_min_float",
|
||||
"__reduce_min_int32",
|
||||
"__reduce_min_int64",
|
||||
"__reduce_min_uint32",
|
||||
"__reduce_min_uint64",
|
||||
"__rotate_double",
|
||||
"__rotate_float",
|
||||
"__rotate_int16",
|
||||
"__rotate_int32",
|
||||
"__rotate_int64",
|
||||
"__rotate_int8",
|
||||
"__round_uniform_double",
|
||||
"__round_uniform_float",
|
||||
"__round_varying_double",
|
||||
"__round_varying_float",
|
||||
"__rsqrt_uniform_float",
|
||||
"__rsqrt_varying_float",
|
||||
"__sext_uniform_bool",
|
||||
"__sext_varying_bool",
|
||||
"__shuffle2_double",
|
||||
"__shuffle2_float",
|
||||
"__shuffle2_int16",
|
||||
"__shuffle2_int32",
|
||||
"__shuffle2_int64",
|
||||
"__shuffle2_int8",
|
||||
"__shuffle_double",
|
||||
"__shuffle_float",
|
||||
"__shuffle_int16",
|
||||
"__shuffle_int32",
|
||||
"__shuffle_int64",
|
||||
"__shuffle_int8",
|
||||
"__soa_to_aos3_float",
|
||||
"__soa_to_aos3_float16",
|
||||
"__soa_to_aos3_float4",
|
||||
"__soa_to_aos3_float8",
|
||||
"__soa_to_aos3_int32",
|
||||
"__soa_to_aos4_float",
|
||||
"__soa_to_aos4_float16",
|
||||
"__soa_to_aos4_float4",
|
||||
"__soa_to_aos4_float8",
|
||||
"__soa_to_aos4_int32",
|
||||
"__sqrt_uniform_double",
|
||||
"__sqrt_uniform_float",
|
||||
"__sqrt_varying_double",
|
||||
"__sqrt_varying_float",
|
||||
"__stdlib_atan",
|
||||
"__stdlib_atan2",
|
||||
"__stdlib_atan2f",
|
||||
"__stdlib_atanf",
|
||||
"__stdlib_cos",
|
||||
"__stdlib_cosf",
|
||||
"__stdlib_exp",
|
||||
"__stdlib_expf",
|
||||
"__stdlib_log",
|
||||
"__stdlib_logf",
|
||||
"__stdlib_pow",
|
||||
"__stdlib_powf",
|
||||
"__stdlib_sin",
|
||||
"__stdlib_sincos",
|
||||
"__stdlib_sincosf",
|
||||
"__stdlib_sinf",
|
||||
"__stdlib_tan",
|
||||
"__stdlib_tanf",
|
||||
"__svml_sin",
|
||||
"__svml_cos",
|
||||
"__svml_sincos",
|
||||
"__svml_tan",
|
||||
"__svml_atan",
|
||||
"__svml_atan2",
|
||||
"__svml_exp",
|
||||
"__svml_log",
|
||||
"__svml_pow",
|
||||
"__undef_uniform",
|
||||
"__undef_varying",
|
||||
};
|
||||
|
||||
int count = sizeof(names) / sizeof(names[0]);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
llvm::Function *f = module->getFunction(names[i]);
|
||||
if (f != NULL)
|
||||
f->setLinkage(llvm::GlobalValue::InternalLinkage);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** This utility function takes serialized binary LLVM bitcode and adds its
|
||||
definitions to the given module. Functions in the bitcode that can be
|
||||
mapped to ispc functions are also added to the symbol table.
|
||||
@@ -563,9 +336,9 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
@param module Module to link the bitcode into
|
||||
@param symbolTable Symbol table to add definitions to
|
||||
*/
|
||||
void
|
||||
AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
llvm::Module *module, SymbolTable *symbolTable) {
|
||||
static void
|
||||
lAddBitcode(const unsigned char *bitcode, int length,
|
||||
llvm::Module *module, SymbolTable *symbolTable) {
|
||||
std::string bcErr;
|
||||
llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
|
||||
llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
|
||||
@@ -590,15 +363,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
bcModule->setTargetTriple(mTriple.str());
|
||||
|
||||
std::string(linkError);
|
||||
if (llvm::Linker::LinkModules(module, bcModule,
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
llvm::Linker::DestroySource,
|
||||
#endif // LLVM_3_0
|
||||
&linkError))
|
||||
if (llvm::Linker::LinkModules(module, bcModule, &linkError))
|
||||
Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
|
||||
lSetInternalFunctions(module);
|
||||
if (symbolTable != NULL)
|
||||
lAddModuleSymbols(module, symbolTable);
|
||||
lAddModuleSymbols(module, symbolTable);
|
||||
lCheckModuleIntrinsics(module);
|
||||
}
|
||||
}
|
||||
@@ -610,8 +377,8 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
static void
|
||||
lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
||||
SymbolTable *symbolTable) {
|
||||
Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32,
|
||||
SC_STATIC);
|
||||
Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32);
|
||||
pw->isStatic = true;
|
||||
pw->constValue = new ConstExpr(pw->type, val, SourcePos());
|
||||
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
|
||||
llvm::Constant *linit = LLVMInt32(val);
|
||||
@@ -628,7 +395,8 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
||||
SymbolTable *symbolTable) {
|
||||
std::vector<const Type *> args;
|
||||
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
|
||||
Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
|
||||
Symbol *sym = new Symbol(name, SourcePos(), ft);
|
||||
sym->isStatic = true;
|
||||
|
||||
llvm::Function *func = module->getFunction(name);
|
||||
assert(func != NULL); // it should be declared already...
|
||||
@@ -645,7 +413,8 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
||||
static void
|
||||
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
Symbol *pidx = new Symbol("programIndex", SourcePos(),
|
||||
AtomicType::VaryingConstInt32, SC_STATIC);
|
||||
AtomicType::VaryingConstInt32);
|
||||
pidx->isStatic = true;
|
||||
|
||||
int pi[ISPC_MAX_NVEC];
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
@@ -665,17 +434,17 @@ void
|
||||
DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
|
||||
bool includeStdlibISPC) {
|
||||
// Add the definitions from the compiled builtins-c.c file
|
||||
if (g->target.is32Bit) {
|
||||
if (g->target.is32bit) {
|
||||
extern unsigned char builtins_bitcode_c_32[];
|
||||
extern int builtins_bitcode_c_32_length;
|
||||
AddBitcodeToModule(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
|
||||
module, symbolTable);
|
||||
lAddBitcode(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
|
||||
module, symbolTable);
|
||||
}
|
||||
else {
|
||||
extern unsigned char builtins_bitcode_c_64[];
|
||||
extern int builtins_bitcode_c_64_length;
|
||||
AddBitcodeToModule(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
|
||||
module, symbolTable);
|
||||
lAddBitcode(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
|
||||
module, symbolTable);
|
||||
}
|
||||
|
||||
// Next, add the target's custom implementations of the various needed
|
||||
@@ -684,34 +453,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
case Target::SSE2:
|
||||
extern unsigned char builtins_bitcode_sse2[];
|
||||
extern int builtins_bitcode_sse2_length;
|
||||
extern unsigned char builtins_bitcode_sse2_x2[];
|
||||
extern int builtins_bitcode_sse2_x2_length;
|
||||
switch (g->target.vectorWidth) {
|
||||
case 4:
|
||||
AddBitcodeToModule(builtins_bitcode_sse2, builtins_bitcode_sse2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 8:
|
||||
AddBitcodeToModule(builtins_bitcode_sse2_x2, builtins_bitcode_sse2_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
lAddBitcode(builtins_bitcode_sse2, builtins_bitcode_sse2_length, module,
|
||||
symbolTable);
|
||||
break;
|
||||
case Target::SSE4:
|
||||
extern unsigned char builtins_bitcode_sse4[];
|
||||
extern int builtins_bitcode_sse4_length;
|
||||
extern unsigned char builtins_bitcode_sse4_x2[];
|
||||
extern int builtins_bitcode_sse4_x2_length;
|
||||
extern unsigned char builtins_bitcode_sse4x2[];
|
||||
extern int builtins_bitcode_sse4x2_length;
|
||||
switch (g->target.vectorWidth) {
|
||||
case 4:
|
||||
AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
|
||||
module, symbolTable);
|
||||
lAddBitcode(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 8:
|
||||
AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length,
|
||||
module, symbolTable);
|
||||
lAddBitcode(builtins_bitcode_sse4x2, builtins_bitcode_sse4x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
@@ -722,14 +479,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
case 8:
|
||||
extern unsigned char builtins_bitcode_avx[];
|
||||
extern int builtins_bitcode_avx_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length,
|
||||
module, symbolTable);
|
||||
lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module,
|
||||
symbolTable);
|
||||
break;
|
||||
case 16:
|
||||
extern unsigned char builtins_bitcode_avx_x2[];
|
||||
extern int builtins_bitcode_avx_x2_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
|
||||
module, symbolTable);
|
||||
lAddBitcode(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
@@ -765,8 +522,11 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
// definitions added. Disable emission of performance warnings for
|
||||
// now, since the user doesn't care about any of that in the stdlib
|
||||
// implementation...
|
||||
bool epf = g->emitPerfWarnings;
|
||||
g->emitPerfWarnings = false;
|
||||
extern char stdlib_code[];
|
||||
yy_scan_string(stdlib_code);
|
||||
yyparse();
|
||||
g->emitPerfWarnings = epf;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,7 +55,4 @@
|
||||
void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
|
||||
bool includeStdlib);
|
||||
|
||||
void AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
llvm::Module *module, SymbolTable *symbolTable = NULL);
|
||||
|
||||
#endif // ISPC_STDLIB_H
|
||||
|
||||
1220
builtins.m4
1220
builtins.m4
File diff suppressed because it is too large
Load Diff
195
ctx.h
195
ctx.h
@@ -41,7 +41,9 @@
|
||||
#include "ispc.h"
|
||||
#include <llvm/InstrTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#ifndef LLVM_2_8
|
||||
#include <llvm/Analysis/DIBuilder.h>
|
||||
#endif
|
||||
#include <llvm/Analysis/DebugInfo.h>
|
||||
|
||||
struct CFInfo;
|
||||
@@ -57,22 +59,17 @@ struct CFInfo;
|
||||
class FunctionEmitContext {
|
||||
public:
|
||||
/** Create a new FunctionEmitContext.
|
||||
@param function The Function object representing the function
|
||||
@param funSym Symbol that corresponds to the function
|
||||
@param llvmFunction LLVM function in the current module that corresponds
|
||||
@param returnType The return type of the function
|
||||
@param function LLVM function in the current module that corresponds
|
||||
to the function
|
||||
@param funSym Symbol that corresponds to the function
|
||||
@param firstStmtPos Source file position of the first statement in the
|
||||
function
|
||||
*/
|
||||
FunctionEmitContext(Function *function, Symbol *funSym,
|
||||
llvm::Function *llvmFunction,
|
||||
FunctionEmitContext(const Type *returnType, llvm::Function *function, Symbol *funSym,
|
||||
SourcePos firstStmtPos);
|
||||
~FunctionEmitContext();
|
||||
|
||||
/** Returns the Function * corresponding to the function that we're
|
||||
currently generating code for. */
|
||||
const Function *GetFunction() const;
|
||||
|
||||
/** @name Current basic block management
|
||||
@{
|
||||
*/
|
||||
@@ -86,33 +83,20 @@ public:
|
||||
/** @name Mask management
|
||||
@{
|
||||
*/
|
||||
/** Returns the mask value at entry to the current function. */
|
||||
llvm::Value *GetFunctionMask();
|
||||
|
||||
/** Returns the mask value corresponding to "varying" control flow
|
||||
within the current function. (i.e. this doesn't include the effect
|
||||
of the mask at function entry. */
|
||||
llvm::Value *GetInternalMask();
|
||||
|
||||
/** Returns the complete current mask value--i.e. the logical AND of
|
||||
the function entry mask and the internal mask. */
|
||||
llvm::Value *GetFullMask();
|
||||
|
||||
/** Provides the alloca'd pointer to memory to store the full function
|
||||
mask. This is only used to wire up the __mask builtin variable. */
|
||||
void SetMaskPointer(llvm::Value *p);
|
||||
/** Returns the current mask value */
|
||||
llvm::Value *GetMask();
|
||||
|
||||
/** Provides the value of the mask at function entry */
|
||||
void SetFunctionMask(llvm::Value *val);
|
||||
void SetEntryMask(llvm::Value *val);
|
||||
|
||||
/** Sets the internal mask to a new value */
|
||||
void SetInternalMask(llvm::Value *val);
|
||||
/** Sets the mask to a new value */
|
||||
void SetMask(llvm::Value *val);
|
||||
|
||||
/** Sets the internal mask to (oldMask & val) */
|
||||
void SetInternalMaskAnd(llvm::Value *oldMask, llvm::Value *val);
|
||||
/** Sets the mask to (oldMask & val) */
|
||||
void MaskAnd(llvm::Value *oldMask, llvm::Value *val);
|
||||
|
||||
/** Sets the internal mask to (oldMask & ~val) */
|
||||
void SetInternalMaskAndNot(llvm::Value *oldMask, llvm::Value *test);
|
||||
/** Sets the mask to (oldMask & ~val) */
|
||||
void MaskAndNot(llvm::Value *oldMask, llvm::Value *test);
|
||||
|
||||
/** Emits a branch instruction to the basic block btrue if any of the
|
||||
lanes of current mask are on and bfalse if none are on. */
|
||||
@@ -131,8 +115,9 @@ public:
|
||||
@{
|
||||
*/
|
||||
/** Notifies the FunctionEmitContext that we're starting emission of an
|
||||
'if' statement with a uniform test. */
|
||||
void StartUniformIf();
|
||||
'if' statement with a uniform test. The value of the mask going
|
||||
into the 'if' statement is provided in the oldMask parameter. */
|
||||
void StartUniformIf(llvm::Value *oldMask);
|
||||
|
||||
/** Notifies the FunctionEmitContext that we're starting emission of an
|
||||
'if' statement with a varying test. The value of the mask going
|
||||
@@ -147,9 +132,10 @@ public:
|
||||
for a loop. Basic blocks are provides for where 'break' and
|
||||
'continue' statements should jump to (if all running lanes want to
|
||||
break or continue), uniformControlFlow indicates whether the loop
|
||||
condition is 'uniform'. */
|
||||
condition is 'uniform', and oldMask provides the current mask going
|
||||
into the loop. */
|
||||
void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget,
|
||||
bool uniformControlFlow);
|
||||
bool uniformControlFlow, llvm::Value *oldMask);
|
||||
|
||||
/** Informs FunctionEmitContext of the value of the mask at the start
|
||||
of a loop body. */
|
||||
@@ -159,13 +145,6 @@ public:
|
||||
finished. */
|
||||
void EndLoop();
|
||||
|
||||
/** Indicates that code generation for a 'foreach' or 'foreach_tiled'
|
||||
loop is about to start. The provided basic block pointer indicates
|
||||
where control flow should go if a 'continue' statement is executed
|
||||
in the loop. */
|
||||
void StartForeach(llvm::BasicBlock *continueTarget);
|
||||
void EndForeach();
|
||||
|
||||
/** Emit code for a 'break' statement in a loop. If doCoherenceCheck
|
||||
is true, then if we're in a 'varying' loop, code will be emitted to
|
||||
see if all of the lanes want to break, in which case a jump to the
|
||||
@@ -190,8 +169,6 @@ public:
|
||||
flow */
|
||||
int VaryingCFDepth() const;
|
||||
|
||||
bool InForeachLoop() const;
|
||||
|
||||
/** Called to generate code for 'return' statement; value is the
|
||||
expression in the return statement (if non-NULL), and
|
||||
doCoherenceCheck indicates whether instructions should be generated
|
||||
@@ -233,6 +210,9 @@ public:
|
||||
i32. */
|
||||
llvm::Value *I1VecToBoolVec(llvm::Value *b);
|
||||
|
||||
/** Returns the size of the given type. */
|
||||
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *ty);
|
||||
|
||||
/** If the user has asked to compile the program with instrumentation,
|
||||
this inserts a callback to the user-supplied instrumentation
|
||||
function at the current point in the code. */
|
||||
@@ -316,18 +296,12 @@ public:
|
||||
llvm::CmpInst::Predicate pred,
|
||||
llvm::Value *v0, llvm::Value *v1, const char *name = NULL);
|
||||
|
||||
/** Given a scalar value, return a vector of the same type (or an
|
||||
array, for pointer types). */
|
||||
llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
|
||||
|
||||
llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
|
||||
llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
|
||||
llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
|
||||
@@ -339,37 +313,26 @@ public:
|
||||
llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
|
||||
const char *name = NULL);
|
||||
|
||||
/** These GEP methods are generalizations of the standard ones in LLVM;
|
||||
they support both uniform and varying basePtr values as well as
|
||||
uniform and varying index values (arrays of indices). Varying base
|
||||
pointers are expected to come in as vectors of i32/i64 (depending
|
||||
on the target), since LLVM doesn't currently support vectors of
|
||||
pointers. The underlying type of the base pointer must be provided
|
||||
via the ptrType parameter */
|
||||
llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
|
||||
const Type *ptrType, const char *name = NULL);
|
||||
/** This GEP method is a generalization of the standard one in LLVM; it
|
||||
supports both uniform and varying basePtr values (an array of
|
||||
pointers) as well as uniform and varying index values (arrays of
|
||||
indices). */
|
||||
llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
|
||||
llvm::Value *index1, const Type *ptrType,
|
||||
llvm::Value *index1, const char *name = NULL);
|
||||
|
||||
/** This is a convenience method to generate a GEP instruction with
|
||||
indices with values with known constant values as the ispc program
|
||||
is being compiled. */
|
||||
llvm::Value *GetElementPtrInst(llvm::Value *basePtr, int v0, int v1,
|
||||
const char *name = NULL);
|
||||
|
||||
/** This method returns a new pointer that represents offsetting the
|
||||
given base pointer to point at the given element number of the
|
||||
structure type that the base pointer points to. (The provided
|
||||
pointer must be a pointer to a structure type. The ptrType gives
|
||||
the type of the pointer, though it may be NULL if the base pointer
|
||||
is uniform. */
|
||||
llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
|
||||
const Type *ptrType, const char *name = NULL);
|
||||
|
||||
/** Load from the memory location(s) given by lvalue, using the given
|
||||
mask. The lvalue may be varying, in which case this corresponds to
|
||||
a gather from the multiple memory locations given by the array of
|
||||
pointer values given by the lvalue. If the lvalue is not varying,
|
||||
then both the mask pointer and the type pointer may be NULL. */
|
||||
llvm::Value *LoadInst(llvm::Value *ptr, llvm::Value *mask,
|
||||
const Type *ptrType, const char *name = NULL);
|
||||
|
||||
llvm::Value *LoadInst(llvm::Value *ptr, const char *name = NULL);
|
||||
/** Load from the memory location(s) given by lvalue. The lvalue may
|
||||
be varying, in which case this corresponds to a gather from the
|
||||
multiple memory locations given by the array of pointer values
|
||||
given by the lvalue. If the lvalue is not varying, then the type
|
||||
parameter may be NULL. */
|
||||
llvm::Value *LoadInst(llvm::Value *lvalue, const Type *type,
|
||||
const char *name = NULL);
|
||||
|
||||
/** Emits an alloca instruction to allocate stack storage for the given
|
||||
type. If a non-zero alignment is specified, the object is also
|
||||
@@ -377,20 +340,21 @@ public:
|
||||
instruction is added at the start of the function in the entry
|
||||
basic block; if it should be added to the current basic block, then
|
||||
the atEntryBlock parameter should be false. */
|
||||
llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType,
|
||||
const char *name = NULL, int align = 0,
|
||||
bool atEntryBlock = true);
|
||||
llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name = NULL,
|
||||
int align = 0, bool atEntryBlock = true);
|
||||
|
||||
/** Standard store instruction; for this variant, the lvalue must be a
|
||||
single pointer, not a varying lvalue. */
|
||||
void StoreInst(llvm::Value *value, llvm::Value *ptr);
|
||||
void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
const char *name = NULL);
|
||||
|
||||
/** In this variant of StoreInst(), the lvalue may be varying. If so,
|
||||
this corresponds to a scatter. Whether the lvalue is uniform of
|
||||
varying, the given storeMask is used to mask the stores so that
|
||||
they only execute for the active program instances. */
|
||||
void StoreInst(llvm::Value *value, llvm::Value *ptr,
|
||||
llvm::Value *storeMask, const Type *ptrType);
|
||||
void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
llvm::Value *storeMask, const Type *rvalueType,
|
||||
const char *name = NULL);
|
||||
|
||||
void BranchInst(llvm::BasicBlock *block);
|
||||
void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
|
||||
@@ -412,30 +376,24 @@ public:
|
||||
llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
|
||||
llvm::Value *val1, const char *name = NULL);
|
||||
|
||||
/** Emits IR to do a function call with the given arguments. If the
|
||||
function type is a varying function pointer type, its full type
|
||||
must be provided in funcType. funcType can be NULL if func is a
|
||||
uniform function pointer. */
|
||||
llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
|
||||
const std::vector<llvm::Value *> &args,
|
||||
const char *name = NULL);
|
||||
|
||||
llvm::Instruction *CallInst(llvm::Function *func,
|
||||
const std::vector<llvm::Value *> &args,
|
||||
const char *name = NULL);
|
||||
/** This is a convenience method that issues a call instruction to a
|
||||
function that takes just a single argument. */
|
||||
llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
|
||||
llvm::Value *arg, const char *name = NULL);
|
||||
llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg,
|
||||
const char *name = NULL);
|
||||
|
||||
/** This is a convenience method that issues a call instruction to a
|
||||
function that takes two arguments. */
|
||||
llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
|
||||
llvm::Value *arg0, llvm::Value *arg1,
|
||||
const char *name = NULL);
|
||||
llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg0,
|
||||
llvm::Value *arg1, const char *name = NULL);
|
||||
|
||||
/** Launch an asynchronous task to run the given function, passing it
|
||||
he given argument values. */
|
||||
llvm::Value *LaunchInst(llvm::Value *callee,
|
||||
std::vector<llvm::Value *> &argVals,
|
||||
llvm::Value *launchCount);
|
||||
llvm::Instruction *LaunchInst(llvm::Function *callee,
|
||||
std::vector<llvm::Value *> &argVals,
|
||||
llvm::Value *launchCount);
|
||||
|
||||
void SyncInst();
|
||||
|
||||
@@ -443,9 +401,6 @@ public:
|
||||
/** @} */
|
||||
|
||||
private:
|
||||
/** Pointer to the Function for which we're currently generating code. */
|
||||
Function *function;
|
||||
|
||||
/** The basic block into which we add any alloca instructions that need
|
||||
to go at the very start of the function. */
|
||||
llvm::BasicBlock *allocaBlock;
|
||||
@@ -455,16 +410,8 @@ private:
|
||||
llvm::BasicBlock *bblock;
|
||||
|
||||
/** Pointer to stack-allocated memory that stores the current value of
|
||||
the full program mask. */
|
||||
llvm::Value *fullMaskPointer;
|
||||
|
||||
/** Pointer to stack-allocated memory that stores the current value of
|
||||
the program mask representing varying control flow within the
|
||||
function. */
|
||||
llvm::Value *internalMaskPointer;
|
||||
|
||||
/** Value of the program mask when the function starts execution. */
|
||||
llvm::Value *functionMaskValue;
|
||||
the program mask. */
|
||||
llvm::Value *maskPtr;
|
||||
|
||||
/** Current source file position; if debugging information is being
|
||||
generated, this position is used to set file/line information for
|
||||
@@ -475,6 +422,12 @@ private:
|
||||
for error messages and debugging symbols. */
|
||||
SourcePos funcStartPos;
|
||||
|
||||
/** Type of result that the current function returns. */
|
||||
const Type *returnType;
|
||||
|
||||
/** Value of the program mask when the function starts execution. */
|
||||
llvm::Value *entryMask;
|
||||
|
||||
/** If currently in a loop body, the value of the mask at the start of
|
||||
the loop. */
|
||||
llvm::Value *loopMask;
|
||||
@@ -538,23 +491,19 @@ private:
|
||||
llvm::Value *launchGroupHandlePtr;
|
||||
|
||||
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
|
||||
static void addGSMetadata(llvm::Value *inst, SourcePos pos);
|
||||
static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
|
||||
bool ifsInLoopAllUniform() const;
|
||||
void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
|
||||
llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
|
||||
|
||||
llvm::Value *applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
|
||||
const Type *ptrType);
|
||||
|
||||
void restoreMaskGivenReturns(llvm::Value *oldMask);
|
||||
|
||||
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
|
||||
llvm::Value *mask);
|
||||
void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
|
||||
llvm::Value *mask);
|
||||
llvm::Value *gather(llvm::Value *ptr, const Type *ptrType, llvm::Value *mask,
|
||||
void scatter(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
llvm::Value *maskPtr, const Type *rvalueType);
|
||||
llvm::Value *gather(llvm::Value *lvalue, const Type *type,
|
||||
const char *name);
|
||||
llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
|
||||
void maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
|
||||
const Type *rvalueType, llvm::Value *maskPtr);
|
||||
};
|
||||
|
||||
#endif // ISPC_CTX_H
|
||||
|
||||
583
decl.cpp
583
decl.cpp
@@ -38,59 +38,10 @@
|
||||
|
||||
#include "decl.h"
|
||||
#include "util.h"
|
||||
#include "module.h"
|
||||
#include "sym.h"
|
||||
#include "type.h"
|
||||
#include "stmt.h"
|
||||
#include "expr.h"
|
||||
#include <stdio.h>
|
||||
#include <set>
|
||||
|
||||
/** Given a Type and a set of type qualifiers, apply the type qualifiers to
|
||||
the type, returning the type that is the result.
|
||||
*/
|
||||
static const Type *
|
||||
lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
|
||||
if (type == NULL)
|
||||
return NULL;
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
|
||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
|
||||
Error(pos, "Illegal to apply both \"signed\" and \"unsigned\" "
|
||||
"qualifiers.");
|
||||
|
||||
const Type *unsignedType = type->GetAsUnsignedType();
|
||||
if (unsignedType != NULL)
|
||||
type = unsignedType;
|
||||
else
|
||||
Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
|
||||
type->GetString().c_str());
|
||||
|
||||
}
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false)
|
||||
Error(pos, "\"signed\" qualifier is illegal with non-integer type "
|
||||
"\"%s\".", type->GetString().c_str());
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
|
||||
type = type->GetAsConstType();
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
|
||||
type = type->GetAsUniformType();
|
||||
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
|
||||
type = type->GetAsVaryingType();
|
||||
else {
|
||||
// otherwise, structs are uniform by default and everything
|
||||
// else is varying by default
|
||||
if (dynamic_cast<const StructType *>(type->GetBaseType()) != NULL)
|
||||
type = type->GetAsUniformType();
|
||||
else
|
||||
type = type->GetAsVaryingType();
|
||||
}
|
||||
|
||||
return type;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// DeclSpecs
|
||||
@@ -98,57 +49,29 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
|
||||
DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
|
||||
baseType = t;
|
||||
storageClass = sc;
|
||||
typeQualifiers = tq;
|
||||
typeQualifier = tq;
|
||||
soaWidth = 0;
|
||||
vectorSize = 0;
|
||||
}
|
||||
|
||||
|
||||
const Type *
|
||||
DeclSpecs::GetBaseType(SourcePos pos) const {
|
||||
const Type *bt = baseType;
|
||||
if (vectorSize > 0) {
|
||||
const AtomicType *atomicType = dynamic_cast<const AtomicType *>(bt);
|
||||
if (atomicType == NULL) {
|
||||
Error(pos, "Only atomic types (int, float, ...) are legal for vector "
|
||||
"types.");
|
||||
return NULL;
|
||||
}
|
||||
bt = new VectorType(atomicType, vectorSize);
|
||||
}
|
||||
|
||||
return lApplyTypeQualifiers(typeQualifiers, bt, pos);
|
||||
}
|
||||
|
||||
|
||||
static const char *
|
||||
lGetStorageClassName(StorageClass storageClass) {
|
||||
switch (storageClass) {
|
||||
case SC_NONE: return "";
|
||||
case SC_EXTERN: return "extern";
|
||||
case SC_EXTERN_C: return "extern \"C\"";
|
||||
case SC_EXPORT: return "export";
|
||||
case SC_STATIC: return "static";
|
||||
case SC_TYPEDEF: return "typedef";
|
||||
default: FATAL("Unhandled storage class in lGetStorageClassName");
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DeclSpecs::Print() const {
|
||||
printf("%s ", lGetStorageClassName(storageClass));
|
||||
if (storageClass == SC_EXTERN) printf("extern ");
|
||||
if (storageClass == SC_EXTERN_C) printf("extern \"C\" ");
|
||||
if (storageClass == SC_EXPORT) printf("export ");
|
||||
if (storageClass == SC_STATIC) printf("static ");
|
||||
if (storageClass == SC_TYPEDEF) printf("typedef ");
|
||||
|
||||
if (soaWidth > 0) printf("soa<%d> ", soaWidth);
|
||||
|
||||
if (typeQualifiers & TYPEQUAL_INLINE) printf("inline ");
|
||||
if (typeQualifiers & TYPEQUAL_CONST) printf("const ");
|
||||
if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
|
||||
if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
|
||||
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
|
||||
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
|
||||
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
|
||||
if (typeQualifier & TYPEQUAL_INLINE) printf("inline ");
|
||||
if (typeQualifier & TYPEQUAL_CONST) printf("const ");
|
||||
if (typeQualifier & TYPEQUAL_UNIFORM) printf("uniform ");
|
||||
if (typeQualifier & TYPEQUAL_VARYING) printf("varying ");
|
||||
if (typeQualifier & TYPEQUAL_TASK) printf("task ");
|
||||
if (typeQualifier & TYPEQUAL_REFERENCE) printf("reference ");
|
||||
if (typeQualifier & TYPEQUAL_UNSIGNED) printf("unsigned ");
|
||||
|
||||
printf("%s", baseType->GetString().c_str());
|
||||
|
||||
@@ -159,46 +82,34 @@ DeclSpecs::Print() const {
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Declarator
|
||||
|
||||
Declarator::Declarator(DeclaratorKind dk, SourcePos p)
|
||||
: pos(p), kind(dk) {
|
||||
child = NULL;
|
||||
typeQualifiers = 0;
|
||||
arraySize = -1;
|
||||
sym = NULL;
|
||||
Declarator::Declarator(Symbol *s, SourcePos p)
|
||||
: pos(p) {
|
||||
sym = s;
|
||||
functionArgs = NULL;
|
||||
isFunction = false;
|
||||
initExpr = NULL;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
|
||||
const Type *t = GetType(ds);
|
||||
Symbol *sym = GetSymbol();
|
||||
if (sym != NULL) {
|
||||
sym->type = t;
|
||||
sym->storageClass = ds->storageClass;
|
||||
}
|
||||
Declarator::AddArrayDimension(int size) {
|
||||
assert(size > 0 || size == -1); // -1 -> unsized
|
||||
arraySize.push_back(size);
|
||||
}
|
||||
|
||||
|
||||
Symbol *
|
||||
Declarator::GetSymbol() const {
|
||||
// The symbol lives at the last child in the chain, so walk down there
|
||||
// and return the one there.
|
||||
const Declarator *d = this;
|
||||
while (d->child != NULL)
|
||||
d = d->child;
|
||||
return d->sym;
|
||||
void
|
||||
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
|
||||
sym->type = GetType(ds);
|
||||
|
||||
if (ds->storageClass == SC_STATIC)
|
||||
sym->isStatic = true;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Declarator::Print() const {
|
||||
Symbol *sym = GetSymbol();
|
||||
if (sym != NULL)
|
||||
printf("%s", sym->name.c_str());
|
||||
else
|
||||
printf("(null symbol)");
|
||||
|
||||
printf("%s", sym->name.c_str());
|
||||
if (initExpr != NULL) {
|
||||
printf(" = (");
|
||||
initExpr->Print();
|
||||
@@ -208,305 +119,188 @@ Declarator::Print() const {
|
||||
}
|
||||
|
||||
|
||||
Symbol *
|
||||
Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
|
||||
const FunctionType *type =
|
||||
dynamic_cast<const FunctionType *>(GetType(ds));
|
||||
if (type == NULL)
|
||||
return NULL;
|
||||
|
||||
Symbol *declSym = GetSymbol();
|
||||
assert(declSym != NULL);
|
||||
|
||||
// Get the symbol for the function from the symbol table. (It should
|
||||
// already have been added to the symbol table by AddGlobal() by the
|
||||
// time we get here.)
|
||||
Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
|
||||
if (funSym != NULL)
|
||||
// May be NULL due to error earlier in compilation
|
||||
funSym->pos = pos;
|
||||
|
||||
// Walk down to the declarator for the function. (We have to get past
|
||||
// the stuff that specifies the function's return type before we get to
|
||||
// the function's declarator.)
|
||||
Declarator *d = this;
|
||||
while (d != NULL && d->kind != DK_FUNCTION)
|
||||
d = d->child;
|
||||
assert(d != NULL);
|
||||
|
||||
for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
|
||||
Declaration *pdecl = d->functionParams[i];
|
||||
assert(pdecl->declarators.size() == 1);
|
||||
funArgs->push_back(pdecl->declarators[0]->GetSymbol());
|
||||
}
|
||||
|
||||
return funSym;
|
||||
}
|
||||
|
||||
|
||||
const Type *
|
||||
Declarator::GetType(const Type *base, DeclSpecs *ds) const {
|
||||
bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
|
||||
bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
|
||||
bool isTask = ((typeQualifiers & TYPEQUAL_TASK) != 0);
|
||||
bool isConst = ((typeQualifiers & TYPEQUAL_CONST) != 0);
|
||||
|
||||
if (hasUniformQual && hasVaryingQual) {
|
||||
Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
|
||||
return NULL;
|
||||
}
|
||||
if (kind != DK_FUNCTION && isTask)
|
||||
Error(pos, "\"task\" qualifier illegal in variable declaration.");
|
||||
|
||||
const Type *type = base;
|
||||
switch (kind) {
|
||||
case DK_BASE:
|
||||
// All of the type qualifiers should be in the DeclSpecs for the
|
||||
// base declarator
|
||||
assert(typeQualifiers == 0);
|
||||
assert(child == NULL);
|
||||
return type;
|
||||
|
||||
case DK_POINTER:
|
||||
type = new PointerType(type, hasUniformQual, isConst);
|
||||
if (child != NULL)
|
||||
return child->GetType(type, ds);
|
||||
else
|
||||
return type;
|
||||
break;
|
||||
|
||||
case DK_REFERENCE:
|
||||
if (hasUniformQual)
|
||||
Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
|
||||
if (hasVaryingQual)
|
||||
Error(pos, "\"varying\" qualifier is illegal to apply to references.");
|
||||
if (isConst)
|
||||
Error(pos, "\"const\" qualifier is to illegal apply to references.");
|
||||
|
||||
// The parser should disallow this already, but double check.
|
||||
if (dynamic_cast<const ReferenceType *>(type) != NULL) {
|
||||
Error(pos, "References to references are illegal.");
|
||||
static const Type *
|
||||
lGetType(const Declarator *decl, DeclSpecs *ds,
|
||||
std::vector<int>::const_iterator arrayIter) {
|
||||
if (arrayIter == decl->arraySize.end()) {
|
||||
// If we don't have an array (or have processed all of the array
|
||||
// dimensions in previous recursive calls), we can go ahead and
|
||||
// figure out the final non-array type we have here.
|
||||
const Type *type = ds->baseType;
|
||||
if (type == NULL) {
|
||||
Error(decl->pos, "Type not provided in variable declaration for variable \"%s\".",
|
||||
decl->sym->name.c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
type = new ReferenceType(type);
|
||||
if (child != NULL)
|
||||
return child->GetType(type, ds);
|
||||
else
|
||||
return type;
|
||||
break;
|
||||
// Account for 'unsigned' and 'const' qualifiers in the type
|
||||
if ((ds->typeQualifier & TYPEQUAL_UNSIGNED) != 0) {
|
||||
const Type *unsignedType = type->GetAsUnsignedType();
|
||||
if (unsignedType != NULL)
|
||||
type = unsignedType;
|
||||
else
|
||||
Error(decl->pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
|
||||
type->GetString().c_str());
|
||||
}
|
||||
if ((ds->typeQualifier & TYPEQUAL_CONST) != 0)
|
||||
type = type->GetAsConstType();
|
||||
|
||||
case DK_ARRAY:
|
||||
type = new ArrayType(type, arraySize);
|
||||
if (child)
|
||||
return child->GetType(type, ds);
|
||||
else
|
||||
return type;
|
||||
break;
|
||||
|
||||
case DK_FUNCTION: {
|
||||
std::vector<const Type *> args;
|
||||
std::vector<std::string> argNames;
|
||||
std::vector<ConstExpr *> argDefaults;
|
||||
std::vector<SourcePos> argPos;
|
||||
|
||||
// Loop over the function arguments and store the names, types,
|
||||
// default values (if any), and source file positions each one in
|
||||
// the corresponding vector.
|
||||
for (unsigned int i = 0; i < functionParams.size(); ++i) {
|
||||
Declaration *d = functionParams[i];
|
||||
|
||||
char buf[32];
|
||||
Symbol *sym;
|
||||
if (d->declarators.size() == 0) {
|
||||
// function declaration like foo(float), w/o a name for
|
||||
// the parameter
|
||||
sprintf(buf, "__anon_parameter_%d", i);
|
||||
sym = new Symbol(buf, pos);
|
||||
sym->type = d->declSpecs->GetBaseType(pos);
|
||||
if (ds->vectorSize > 0) {
|
||||
const AtomicType *atomicType = dynamic_cast<const AtomicType *>(type);
|
||||
if (atomicType == NULL) {
|
||||
Error(decl->pos, "Only atomic types (int, float, ...) are legal for vector "
|
||||
"types.");
|
||||
return NULL;
|
||||
}
|
||||
else {
|
||||
sym = d->declarators[0]->GetSymbol();
|
||||
if (sym == NULL) {
|
||||
// Handle more complex anonymous declarations like
|
||||
// float (float **).
|
||||
sprintf(buf, "__anon_parameter_%d", i);
|
||||
sym = new Symbol(buf, d->declarators[0]->pos);
|
||||
sym->type = d->declarators[0]->GetType(d->declSpecs);
|
||||
}
|
||||
}
|
||||
|
||||
if (d->declSpecs->storageClass != SC_NONE)
|
||||
Error(sym->pos, "Storage class \"%s\" is illegal in "
|
||||
"function parameter declaration for parameter \"%s\".",
|
||||
lGetStorageClassName(d->declSpecs->storageClass),
|
||||
sym->name.c_str());
|
||||
|
||||
const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
|
||||
if (at != NULL) {
|
||||
// As in C, arrays are passed to functions as pointers to
|
||||
// their element type. We'll just immediately make this
|
||||
// change now. (One shortcoming of losing the fact that
|
||||
// the it was originally an array is that any warnings or
|
||||
// errors later issued that print the function type will
|
||||
// report this differently than it was originally declared
|
||||
// in the function, but it's not clear that this is a
|
||||
// significant problem.)
|
||||
sym->type = PointerType::GetUniform(at->GetElementType());
|
||||
|
||||
// Make sure there are no unsized arrays (other than the
|
||||
// first dimension) in function parameter lists.
|
||||
at = dynamic_cast<const ArrayType *>(at->GetElementType());
|
||||
while (at != NULL) {
|
||||
if (at->GetElementCount() == 0)
|
||||
Error(sym->pos, "Arrays with unsized dimensions in "
|
||||
"dimensions after the first one are illegal in "
|
||||
"function parameter lists.");
|
||||
at = dynamic_cast<const ArrayType *>(at->GetElementType());
|
||||
}
|
||||
}
|
||||
|
||||
args.push_back(sym->type);
|
||||
argNames.push_back(sym->name);
|
||||
argPos.push_back(sym->pos);
|
||||
|
||||
ConstExpr *init = NULL;
|
||||
if (d->declarators.size()) {
|
||||
// Try to find an initializer expression; if there is one,
|
||||
// it lives down to the base declarator.
|
||||
Declarator *decl = d->declarators[0];
|
||||
while (decl->child != NULL) {
|
||||
assert(decl->initExpr == NULL);
|
||||
decl = decl->child;
|
||||
}
|
||||
|
||||
if (decl->initExpr != NULL &&
|
||||
(decl->initExpr = decl->initExpr->TypeCheck()) != NULL &&
|
||||
(decl->initExpr = decl->initExpr->Optimize()) != NULL &&
|
||||
(init = dynamic_cast<ConstExpr *>(decl->initExpr)) == NULL) {
|
||||
Error(decl->initExpr->pos, "Default value for parameter "
|
||||
"\"%s\" must be a compile-time constant.",
|
||||
sym->name.c_str());
|
||||
}
|
||||
}
|
||||
argDefaults.push_back(init);
|
||||
type = new VectorType(atomicType, ds->vectorSize);
|
||||
}
|
||||
|
||||
const Type *returnType = type;
|
||||
if (returnType == NULL) {
|
||||
Error(pos, "No return type provided in function declaration.");
|
||||
return NULL;
|
||||
// if uniform/varying is specified explicitly, then go with that
|
||||
if ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0)
|
||||
return type->GetAsUniformType();
|
||||
else if ((ds->typeQualifier & TYPEQUAL_VARYING) != 0)
|
||||
return type->GetAsVaryingType();
|
||||
else {
|
||||
// otherwise, structs are uniform by default and everything
|
||||
// else is varying by default
|
||||
if (dynamic_cast<const StructType *>(type) != NULL)
|
||||
return type->GetAsUniformType();
|
||||
else
|
||||
return type->GetAsVaryingType();
|
||||
}
|
||||
|
||||
bool isExported = ds && (ds->storageClass == SC_EXPORT);
|
||||
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
|
||||
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
|
||||
|
||||
if (isExported && isTask) {
|
||||
Error(pos, "Function can't have both \"task\" and \"export\" "
|
||||
"qualifiers");
|
||||
return NULL;
|
||||
}
|
||||
if (isExternC && isTask) {
|
||||
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
|
||||
"qualifiers");
|
||||
return NULL;
|
||||
}
|
||||
if (isExternC && isExported) {
|
||||
Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
|
||||
"qualifiers");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Type *functionType =
|
||||
new FunctionType(returnType, args, pos, argNames, argDefaults,
|
||||
argPos, isTask, isExported, isExternC);
|
||||
return child->GetType(functionType, ds);
|
||||
}
|
||||
default:
|
||||
FATAL("Unexpected decl kind");
|
||||
return NULL;
|
||||
}
|
||||
else {
|
||||
// Peel off one dimension of the array
|
||||
int arraySize = *arrayIter;
|
||||
++arrayIter;
|
||||
|
||||
#if 0
|
||||
// Get the type, not including the arraySize dimension peeled off
|
||||
// above.
|
||||
const Type *childType = lGetType(decl, ds, arrayIter);
|
||||
|
||||
int soaWidth = ds->soaWidth;
|
||||
if (soaWidth == 0)
|
||||
// If there's no "soa<n>" stuff going on, just return a regular
|
||||
// array with the appropriate size
|
||||
return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
|
||||
else {
|
||||
// Make sure we actually have an array of structs ..
|
||||
const StructType *childStructType =
|
||||
dynamic_cast<const StructType *>(childType);
|
||||
if (childStructType == NULL) {
|
||||
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
|
||||
Error(decl->pos, "Illegal to provide soa<%d> qualifier with non-struct "
|
||||
"type \"%s\".", soaWidth, childType->GetString().c_str());
|
||||
return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
|
||||
}
|
||||
else if ((soaWidth & (soaWidth - 1)) != 0) {
|
||||
Error(pos, "soa<%d> width illegal. Value must be power of two.",
|
||||
Error(decl->pos, "soa<%d> width illegal. Value must be power of two.",
|
||||
soaWidth);
|
||||
return NULL;
|
||||
}
|
||||
else if (arraySize != -1 && (arraySize % soaWidth) != 0) {
|
||||
Error(pos, "soa<%d> width must evenly divide array size %d.",
|
||||
Error(decl->pos, "soa<%d> width must evenly divide array size %d.",
|
||||
soaWidth, arraySize);
|
||||
return NULL;
|
||||
}
|
||||
return new SOAArrayType(childStructType, arraySize == -1 ? 0 : arraySize,
|
||||
soaWidth);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
const Type *
|
||||
Declarator::GetType(DeclSpecs *ds) const {
|
||||
const Type *baseType = ds->GetBaseType(pos);
|
||||
const Type *type = GetType(baseType, ds);
|
||||
return type;
|
||||
}
|
||||
bool hasUniformQual = ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0);
|
||||
bool hasVaryingQual = ((ds->typeQualifier & TYPEQUAL_VARYING) != 0);
|
||||
bool isTask = ((ds->typeQualifier & TYPEQUAL_TASK) != 0);
|
||||
bool isReference = ((ds->typeQualifier & TYPEQUAL_REFERENCE) != 0);
|
||||
|
||||
if (hasUniformQual && hasVaryingQual) {
|
||||
Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (isFunction) {
|
||||
std::vector<const Type *> args;
|
||||
std::vector<std::string> argNames;
|
||||
if (functionArgs) {
|
||||
// Loop over the function arguments and get names and types for
|
||||
// each one in the args and argNames arrays
|
||||
for (unsigned int i = 0; i < functionArgs->size(); ++i) {
|
||||
Declaration *d = (*functionArgs)[i];
|
||||
Symbol *sym;
|
||||
if (d->declarators.size() == 0) {
|
||||
// function declaration like foo(float), w/o a name for
|
||||
// the parameter
|
||||
char buf[32];
|
||||
sprintf(buf, "__anon_parameter_%d", i);
|
||||
sym = new Symbol(buf, pos);
|
||||
Declarator *declarator = new Declarator(sym, sym->pos);
|
||||
sym->type = declarator->GetType(d->declSpecs);
|
||||
d->declarators.push_back(declarator);
|
||||
}
|
||||
else {
|
||||
assert(d->declarators.size() == 1);
|
||||
sym = d->declarators[0]->sym;
|
||||
}
|
||||
|
||||
// Arrays are passed by reference, so convert array
|
||||
// parameters to be references here.
|
||||
if (dynamic_cast<const ArrayType *>(sym->type) != NULL)
|
||||
sym->type = new ReferenceType(sym->type, sym->type->IsConstType());
|
||||
|
||||
args.push_back(sym->type);
|
||||
argNames.push_back(sym->name);
|
||||
}
|
||||
}
|
||||
|
||||
if (ds->baseType == NULL) {
|
||||
Warning(pos, "No return type provided in declaration of function \"%s\". "
|
||||
"Treating as \"void\".", sym->name.c_str());
|
||||
ds->baseType = AtomicType::Void;
|
||||
}
|
||||
|
||||
if (isReference) {
|
||||
Error(pos, "Function return types can't be reference types.");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const Type *returnType = lGetType(this, ds, arraySize.begin());
|
||||
if (returnType == NULL)
|
||||
return NULL;
|
||||
|
||||
bool isExported = (ds->storageClass == SC_EXPORT);
|
||||
bool isExternC = (ds->storageClass == SC_EXTERN_C);
|
||||
return new FunctionType(returnType, args, pos, &argNames, isTask,
|
||||
isExported, isExternC);
|
||||
}
|
||||
else {
|
||||
if (isTask)
|
||||
Error(pos, "\"task\" qualifier illegal in variable declaration \"%s\".",
|
||||
sym->name.c_str());
|
||||
|
||||
const Type *type = lGetType(this, ds, arraySize.begin());
|
||||
|
||||
if (type != NULL && isReference) {
|
||||
bool hasConstQual = ((ds->typeQualifier & TYPEQUAL_CONST) != 0);
|
||||
type = new ReferenceType(type, hasConstQual);
|
||||
}
|
||||
|
||||
return type;
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Declaration
|
||||
|
||||
Declaration::Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist) {
|
||||
declSpecs = ds;
|
||||
if (dlist != NULL)
|
||||
declarators = *dlist;
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i)
|
||||
if (declarators[i] != NULL)
|
||||
declarators[i]->InitFromDeclSpecs(declSpecs);
|
||||
}
|
||||
|
||||
|
||||
Declaration::Declaration(DeclSpecs *ds, Declarator *d) {
|
||||
declSpecs = ds;
|
||||
if (d != NULL) {
|
||||
d->InitFromDeclSpecs(ds);
|
||||
declarators.push_back(d);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
std::vector<VariableDeclaration>
|
||||
Declaration::GetVariableDeclarations() const {
|
||||
void
|
||||
Declaration::AddSymbols(SymbolTable *st) const {
|
||||
assert(declSpecs->storageClass != SC_TYPEDEF);
|
||||
std::vector<VariableDeclaration> vars;
|
||||
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i) {
|
||||
if (declarators[i] == NULL)
|
||||
continue;
|
||||
Declarator *decl = declarators[i];
|
||||
if (decl == NULL)
|
||||
// Ignore earlier errors
|
||||
continue;
|
||||
|
||||
Symbol *sym = decl->GetSymbol();
|
||||
if (dynamic_cast<const FunctionType *>(sym->type) != NULL) {
|
||||
// function declaration
|
||||
m->symbolTable->AddFunction(sym);
|
||||
}
|
||||
else {
|
||||
m->symbolTable->AddVariable(sym);
|
||||
vars.push_back(VariableDeclaration(sym, decl->initExpr));
|
||||
}
|
||||
}
|
||||
return vars;
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i)
|
||||
if (declarators[i])
|
||||
st->AddVariable(declarators[i]->sym);
|
||||
}
|
||||
|
||||
|
||||
@@ -528,44 +322,29 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
|
||||
std::vector<const Type *> *elementTypes,
|
||||
std::vector<std::string> *elementNames,
|
||||
std::vector<SourcePos> *elementPositions) {
|
||||
std::set<std::string> seenNames;
|
||||
for (unsigned int i = 0; i < sd.size(); ++i) {
|
||||
const Type *type = sd[i]->type;
|
||||
if (type == NULL)
|
||||
continue;
|
||||
|
||||
// FIXME: making this fake little DeclSpecs here is really
|
||||
// disgusting
|
||||
DeclSpecs ds(type);
|
||||
if (type->IsUniformType())
|
||||
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
|
||||
ds.typeQualifier |= TYPEQUAL_UNIFORM;
|
||||
else
|
||||
ds.typeQualifiers |= TYPEQUAL_VARYING;
|
||||
ds.typeQualifier |= TYPEQUAL_VARYING;
|
||||
|
||||
for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
|
||||
Declarator *d = (*sd[i]->declarators)[j];
|
||||
d->InitFromDeclSpecs(&ds);
|
||||
|
||||
Symbol *sym = d->GetSymbol();
|
||||
// if it's an unsized array, make it a reference to an unsized
|
||||
// array, so the caller can pass a pointer...
|
||||
const ArrayType *at = dynamic_cast<const ArrayType *>(d->sym->type);
|
||||
if (at && at->GetElementCount() == 0)
|
||||
d->sym->type = new ReferenceType(d->sym->type, type->IsConstType());
|
||||
|
||||
const ArrayType *arrayType =
|
||||
dynamic_cast<const ArrayType *>(sym->type);
|
||||
if (arrayType != NULL && arrayType->GetElementCount() == 0) {
|
||||
Error(d->pos, "Unsized arrays aren't allowed in struct "
|
||||
"definitions.");
|
||||
elementTypes->push_back(NULL);
|
||||
}
|
||||
else
|
||||
elementTypes->push_back(sym->type);
|
||||
|
||||
if (seenNames.find(sym->name) != seenNames.end())
|
||||
Error(d->pos, "Struct member \"%s\" has same name as a "
|
||||
"previously-declared member.", sym->name.c_str());
|
||||
else
|
||||
seenNames.insert(sym->name);
|
||||
|
||||
elementNames->push_back(sym->name);
|
||||
elementPositions->push_back(sym->pos);
|
||||
elementTypes->push_back(d->sym->type);
|
||||
elementNames->push_back(d->sym->name);
|
||||
elementPositions->push_back(d->sym->pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
100
decl.h
100
decl.h
@@ -56,11 +56,6 @@
|
||||
|
||||
#include "ispc.h"
|
||||
|
||||
struct VariableDeclaration;
|
||||
|
||||
class Declaration;
|
||||
class Declarator;
|
||||
|
||||
enum StorageClass {
|
||||
SC_NONE,
|
||||
SC_EXTERN,
|
||||
@@ -79,7 +74,7 @@ enum StorageClass {
|
||||
#define TYPEQUAL_UNIFORM (1<<1)
|
||||
#define TYPEQUAL_VARYING (1<<2)
|
||||
#define TYPEQUAL_TASK (1<<3)
|
||||
#define TYPEQUAL_SIGNED (1<<4)
|
||||
#define TYPEQUAL_REFERENCE (1<<4)
|
||||
#define TYPEQUAL_UNSIGNED (1<<5)
|
||||
#define TYPEQUAL_INLINE (1<<6)
|
||||
|
||||
@@ -97,17 +92,15 @@ public:
|
||||
StorageClass storageClass;
|
||||
|
||||
/** Zero or more of the TYPEQUAL_* values, ANDed together. */
|
||||
int typeQualifiers;
|
||||
int typeQualifier;
|
||||
|
||||
/** The basic type provided in the declaration; this should be an
|
||||
AtomicType, EnumType, StructType, or VectorType; other types (like
|
||||
AtomicType, a StructType, or a VectorType; other types (like
|
||||
ArrayTypes) will end up being created if a particular declaration
|
||||
has an array size, etc.
|
||||
*/
|
||||
const Type *baseType;
|
||||
|
||||
const Type *GetBaseType(SourcePos pos) const;
|
||||
|
||||
/** If this is a declaration with a vector type, this gives the vector
|
||||
width. For non-vector types, this is zero.
|
||||
*/
|
||||
@@ -120,14 +113,6 @@ public:
|
||||
};
|
||||
|
||||
|
||||
enum DeclaratorKind {
|
||||
DK_BASE,
|
||||
DK_POINTER,
|
||||
DK_REFERENCE,
|
||||
DK_ARRAY,
|
||||
DK_FUNCTION
|
||||
};
|
||||
|
||||
/** @brief Representation of the declaration of a single variable.
|
||||
|
||||
In conjunction with an instance of the DeclSpecs, this gives us
|
||||
@@ -135,7 +120,13 @@ enum DeclaratorKind {
|
||||
*/
|
||||
class Declarator {
|
||||
public:
|
||||
Declarator(DeclaratorKind dk, SourcePos p);
|
||||
Declarator(Symbol *s, SourcePos p);
|
||||
|
||||
/** As the parser peels off array dimension declarations after the
|
||||
symbol name, it calls this method to provide them to the
|
||||
Declarator.
|
||||
*/
|
||||
void AddArrayDimension(int size);
|
||||
|
||||
/** Once a DeclSpecs instance is available, this method completes the
|
||||
initialization of the Symbol, setting its Type accordingly.
|
||||
@@ -143,51 +134,21 @@ public:
|
||||
void InitFromDeclSpecs(DeclSpecs *ds);
|
||||
|
||||
/** Get the actual type of the combination of Declarator and the given
|
||||
DeclSpecs. If an explicit base type is provided, the declarator is
|
||||
applied to that type; otherwise the base type from the DeclSpecs is
|
||||
used. */
|
||||
DeclSpecs */
|
||||
const Type *GetType(DeclSpecs *ds) const;
|
||||
const Type *GetType(const Type *base, DeclSpecs *ds) const;
|
||||
|
||||
/** Returns the symbol corresponding to the function declared by this
|
||||
declarator and symbols for its arguments in *args. */
|
||||
Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
|
||||
|
||||
/** Returns the symbol associated with the declarator. */
|
||||
Symbol *GetSymbol() const;
|
||||
|
||||
void Print() const;
|
||||
|
||||
/** Position of the declarator in the source program. */
|
||||
const SourcePos pos;
|
||||
|
||||
/** The kind of this declarator; complex declarations are assembled as
|
||||
a hierarchy of Declarators. (For example, a pointer to an int
|
||||
would have a root declarator with kind DK_POINTER and with the
|
||||
Declarator::child member pointing to a DK_BASE declarator for the
|
||||
int). */
|
||||
const DeclaratorKind kind;
|
||||
|
||||
/** Child pointer if needed; this can only be non-NULL if the
|
||||
declarator's kind isn't DK_BASE. */
|
||||
Declarator *child;
|
||||
|
||||
/** Type qualifiers provided with the declarator. */
|
||||
int typeQualifiers;
|
||||
|
||||
/** For array declarators, this gives the declared size of the array.
|
||||
Unsized arrays have arraySize == 0. */
|
||||
int arraySize;
|
||||
|
||||
/** Symbol associated with the declarator. */
|
||||
Symbol *sym;
|
||||
|
||||
/** If this declarator includes an array specification, the sizes of
|
||||
the array dimensions are represented here.
|
||||
*/
|
||||
std::vector<int> arraySize;
|
||||
/** Initialization expression for the variable. May be NULL. */
|
||||
Expr *initExpr;
|
||||
|
||||
/** For function declarations, this holds the Declaration *s for the
|
||||
funciton's parameters. */
|
||||
std::vector<Declaration *> functionParams;
|
||||
bool isFunction;
|
||||
std::vector<Declaration *> *functionArgs;
|
||||
};
|
||||
|
||||
|
||||
@@ -196,18 +157,27 @@ public:
|
||||
*/
|
||||
class Declaration {
|
||||
public:
|
||||
Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL);
|
||||
Declaration(DeclSpecs *ds, Declarator *d);
|
||||
Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL) {
|
||||
declSpecs = ds;
|
||||
if (dlist != NULL)
|
||||
declarators = *dlist;
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i)
|
||||
if (declarators[i] != NULL)
|
||||
declarators[i]->InitFromDeclSpecs(declSpecs);
|
||||
}
|
||||
Declaration(DeclSpecs *ds, Declarator *d) {
|
||||
declSpecs = ds;
|
||||
if (d) {
|
||||
d->InitFromDeclSpecs(ds);
|
||||
declarators.push_back(d);
|
||||
}
|
||||
}
|
||||
|
||||
/** Adds the symbols for the variables in the declaration to the symbol
|
||||
table. */
|
||||
void AddSymbols(SymbolTable *st) const;
|
||||
void Print() const;
|
||||
|
||||
/** This method walks through all of the Declarators in a declaration
|
||||
and returns a fully-initialized Symbol and (possibly) and
|
||||
initialization expression for each one. (This allows the rest of
|
||||
the system to not have to worry about the mess of the general
|
||||
Declarator representation.) */
|
||||
std::vector<VariableDeclaration> GetVariableDeclarations() const;
|
||||
|
||||
DeclSpecs *declSpecs;
|
||||
std::vector<Declarator *> declarators;
|
||||
};
|
||||
|
||||
@@ -1,85 +1,3 @@
|
||||
=== v1.1.0 === (5 December 2011)
|
||||
|
||||
This is a major new release of the compiler, with significant additions to
|
||||
language functionality and capabilities. It includes a number of small
|
||||
language syntax changes that will require modification of existing
|
||||
programs. These changes should generally be straightforward and all are
|
||||
steps toward eliminating parts of ispc syntax that are incompatible with
|
||||
C/C++. See
|
||||
http://ispc.github.com/ispc.html#updating-ispc-programs-for-changes-in-ispc-1-1
|
||||
for more information about these changes.
|
||||
|
||||
ispc now fully supports pointers, including pointer arithmetic, implicit
|
||||
conversions of arrays to pointers, and all of the other capabilities of
|
||||
pointers in C. See http://ispc.github.com/ispc.html#pointer-types for more
|
||||
information about pointers in ispc and
|
||||
http://ispc.github.com/ispc.html#function-pointer-types for information
|
||||
about function pointers in ispc.
|
||||
|
||||
Reference types are now declared with C++ syntax (e.g. "const float &foo").
|
||||
|
||||
ispc now supports 64-bit addressing. For performance reasons, this
|
||||
capability is disabled by default (even on 64-bit targets), but can be
|
||||
enabled with a command-line flag:
|
||||
http://ispc.github.com/ispc.html#selecting-32-or-64-bit-addressing.
|
||||
|
||||
This release features new parallel "foreach" statements, which make it
|
||||
easier in many instances to map program instances to data for data-parallel
|
||||
computation than the programIndex/programCount mechanism:
|
||||
http://ispc.github.com/ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled.
|
||||
|
||||
Finally, all of the system's documentation has been significantly revised.
|
||||
The documentation of ispc's parallel execution model has been rewritten:
|
||||
http://ispc.github.com/ispc.html#the-ispc-parallel-execution-model, and
|
||||
there is now a more specific discussion of similarities and differences
|
||||
between ispc and C/C++:
|
||||
http://ispc.github.com/ispc.html#relationship-to-the-c-programming-language.
|
||||
There is now a separate FAQ (http://ispc.github.com/faq.html), and a
|
||||
Performance Guide (http://ispc.github.com/perfguide.html).
|
||||
|
||||
=== v1.0.12 === (20 October 2011)
|
||||
|
||||
This release includes a new "double-pumped" 8-wide target for SSE2,
|
||||
"sse2-x2". Like the sse4-x2 and avx-x2 targets, this target may deliver
|
||||
higher performance for some workloads than the regular sse2 target. (For
|
||||
other workloads, it may be slower.)
|
||||
|
||||
The ispc language now includes an "assert()" statement. See
|
||||
http://ispc.github.com/ispc.html#assertions for more information.
|
||||
|
||||
The compiler now sets a preprocessor #define based on the target ISA; for
|
||||
example, ISPC_TARGET_SSE4 is defined for the sse4 targets, and so forth.
|
||||
|
||||
The standard library now provides high-performance routines for converting
|
||||
between some "array of structures" and "structure of arrays" formats.
|
||||
See
|
||||
http://ispc.github.com/ispc.html#converting-between-array-of-structures-and-structure-of-arrays-layout
|
||||
for more information.
|
||||
|
||||
Inline functions now have static linkage.
|
||||
|
||||
A number of improvements have been made to the optimization passes that
|
||||
detect when gathers and scatters can be transformed into vector stores and
|
||||
loads, respectively. In particular, these passes now handle variables that
|
||||
are used as loop induction variables much better.
|
||||
|
||||
=== v1.0.11 === (6 October 2011)
|
||||
|
||||
The main new feature in this release is support for generating code for
|
||||
multiple targets (e.g., SSE2, SSE4, and AVX) and having the compiled code
|
||||
select the best variant at execution time. For more information, see
|
||||
http://ispc.github.com/ispc.html#compiling-with-support-for-multiple-instruction-sets.
|
||||
|
||||
All of the examples now take advantage of the support for multiple
|
||||
compilation targets; thus, if one has an AVX system, it's not necessary to
|
||||
recompile the examples to use the AVX target.
|
||||
|
||||
Performance of the built-in task system that is used in the examples has
|
||||
been improved.
|
||||
|
||||
Finally, the print() statement now works on OSX; it had been broken for the
|
||||
last few releases.
|
||||
|
||||
=== v1.0.10 === (30 September 2011)
|
||||
|
||||
This release features an extensive new example showing the application of
|
||||
|
||||
@@ -1,12 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
for i in ispc perfguide faq; do
|
||||
rst2html.py --template=template.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css $i.txt > $i.html
|
||||
done
|
||||
|
||||
rst2html.py --template=template-perf.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css perf.txt > perf.html
|
||||
rst2html.py ispc.txt > ispc.html
|
||||
|
||||
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
|
||||
#pdflatex ispc.tex
|
||||
|
||||
482
docs/faq.txt
482
docs/faq.txt
@@ -1,482 +0,0 @@
|
||||
=============================================================
|
||||
Intel® SPMD Program Compiler Frequently Asked Questions (FAQ)
|
||||
=============================================================
|
||||
|
||||
This document includes a number of frequently (and not frequently) asked
|
||||
questions about ispc, the Intel® SPMD Program Compiler. The source to this
|
||||
document is in the file ``docs/faq.txt`` in the ``ispc`` source
|
||||
distribution.
|
||||
|
||||
* Understanding ispc's Output
|
||||
|
||||
+ `How can I see the assembly language generated by ispc?`_
|
||||
+ `How can I have the assembly output be printed using Intel assembly syntax?`_
|
||||
+ `Why are there multiple versions of exported ispc functions in the assembly output?`_
|
||||
+ `How can I more easily see gathers and scatters in generated assembly?`_
|
||||
|
||||
* Interoperability
|
||||
|
||||
+ `How can I supply an initial execution mask in the call from the application?`_
|
||||
+ `How can I generate a single binary executable with support for multiple instruction sets?`_
|
||||
+ `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
|
||||
|
||||
* Programming Techniques
|
||||
|
||||
+ `What primitives are there for communicating between SPMD program instances?`_
|
||||
+ `How can a gang of program instances generate variable amounts of output efficiently?`_
|
||||
+ `Is it possible to use ispc for explicit vector programming?`_
|
||||
+ `How can I debug my ispc programs using Valgrind?`_
|
||||
|
||||
Understanding ispc's Output
|
||||
===========================
|
||||
|
||||
How can I see the assembly language generated by ispc?
|
||||
------------------------------------------------------
|
||||
|
||||
The ``--emit-asm`` flag causes assembly output to be generated. If the
|
||||
``-o`` command-line flag is also supplied, the assembly is stored in the
|
||||
given file, or printed to standard output if ``-`` is specified for the
|
||||
filename. For example, given the simple ``ispc`` program:
|
||||
|
||||
::
|
||||
|
||||
export uniform int foo(uniform int a, uniform int b) {
|
||||
return a+b;
|
||||
}
|
||||
|
||||
If the SSE4 target is used, then the following assembly is printed:
|
||||
|
||||
::
|
||||
|
||||
_foo:
|
||||
addl %esi, %edi
|
||||
movl %edi, %eax
|
||||
ret
|
||||
|
||||
|
||||
How can I have the assembly output be printed using Intel assembly syntax?
|
||||
--------------------------------------------------------------------------
|
||||
|
||||
The ``ispc`` compiler is currently only able to emit assembly with AT+T
|
||||
syntax, where the destination operand is the last operand after an
|
||||
instruction. If you'd prefer Intel assembly output, one option is to use
|
||||
Agner Fog's ``objconv`` tool: have ``ispc`` emit a native object file and
|
||||
then use ``objconv`` to disassemble it, specifying the assembler syntax
|
||||
that you prefer. ``objconv`` `is available for download here`_.
|
||||
|
||||
.. _is available for download here: http://www.agner.org/optimize/#objconv
|
||||
|
||||
Why are there multiple versions of exported ispc functions in the assembly output?
|
||||
----------------------------------------------------------------------------------
|
||||
|
||||
Two generations of all functions qualified with ``export`` are generated:
|
||||
one of them is for being be called by other ``ispc`` functions, and the
|
||||
other is to be called by the application. The application callable
|
||||
function has the original function's name, while the ``ispc``-callable
|
||||
function has a mangled name that encodes the types of the function's
|
||||
parameters.
|
||||
|
||||
The crucial difference between these two functions is that the
|
||||
application-callable function doesn't take a parameter encoding the current
|
||||
execution mask, while ``ispc``-callable functions have a hidden mask
|
||||
parameter. An implication of this difference is that the ``export``
|
||||
function starts with the execution mask "all on". This allows a number of
|
||||
improvements in the generated code, particularly on architectures that
|
||||
don't have support for masked load and store instructions.
|
||||
|
||||
As an example, consider this short function, which loads a vector's worth
|
||||
values from two arrays in memory, adds them, and writes the result to an
|
||||
output array.
|
||||
|
||||
::
|
||||
|
||||
export void foo(uniform float a[], uniform float b[],
|
||||
uniform float result[]) {
|
||||
float aa = a[programIndex], bb = b[programIndex];
|
||||
result[programIndex] = aa+bb;
|
||||
}
|
||||
|
||||
Here is the assembly code for the application-callable instance of the
|
||||
function.
|
||||
|
||||
::
|
||||
|
||||
_foo:
|
||||
movups (%rsi), %xmm1
|
||||
movups (%rdi), %xmm0
|
||||
addps %xmm1, %xmm0
|
||||
movups %xmm0, (%rdx)
|
||||
ret
|
||||
|
||||
|
||||
And here is the assembly code for the ``ispc``-callable instance of the
|
||||
function.
|
||||
|
||||
::
|
||||
|
||||
"_foo___uptr<Uf>uptr<Uf>uptr<Uf>":
|
||||
movmskps %xmm0, %eax
|
||||
cmpl $15, %eax
|
||||
je LBB0_3
|
||||
testl %eax, %eax
|
||||
jne LBB0_4
|
||||
ret
|
||||
LBB0_3:
|
||||
movups (%rsi), %xmm1
|
||||
movups (%rdi), %xmm0
|
||||
addps %xmm1, %xmm0
|
||||
movups %xmm0, (%rdx)
|
||||
ret
|
||||
LBB0_4:
|
||||
####
|
||||
#### Code elided; handle mixed mask case..
|
||||
####
|
||||
ret
|
||||
|
||||
There are a few things to notice in this code. First, the current program
|
||||
mask is coming in via the ``%xmm0`` register and the initial few
|
||||
instructions in the function essentially check to see if the mask is all on
|
||||
or all off. If the mask is all on, the code at the label LBB0_3 executes;
|
||||
it's the same as the code that was generated for ``_foo`` above. If the
|
||||
mask is all off, then there's nothing to be done, and the function can
|
||||
return immediately.
|
||||
|
||||
In the case of a mixed mask, a substantial amount of code is generated to
|
||||
load from and then store to only the array elements that correspond to
|
||||
program instances where the mask is on. (This code is elided below). This
|
||||
general pattern of having two-code paths for the "all on" and "mixed" mask
|
||||
cases is used in the code generated for almost all but the most simple
|
||||
functions (where the overhead of the test isn't worthwhile.)
|
||||
|
||||
How can I more easily see gathers and scatters in generated assembly?
|
||||
---------------------------------------------------------------------
|
||||
|
||||
Because CPU vector ISAs don't have native gather and scatter instructions,
|
||||
these memory operations are turned into sequences of a series of
|
||||
instructions in the code that ``ispc`` generates. In some cases, it can be
|
||||
useful to see where gathers and scatters actually happen in code; there is
|
||||
an otherwise undocumented command-line flag that provides this information.
|
||||
|
||||
Consider this simple program:
|
||||
|
||||
::
|
||||
|
||||
void set(uniform int a[], int value, int index) {
|
||||
a[index] = value;
|
||||
}
|
||||
|
||||
When compiled normally to the SSE4 target, this program generates this
|
||||
extensive code sequence, which makes it more difficult to see what the
|
||||
program is actually doing.
|
||||
|
||||
::
|
||||
|
||||
"_set___uptr<Ui>ii":
|
||||
pmulld LCPI0_0(%rip), %xmm1
|
||||
movmskps %xmm2, %eax
|
||||
testb $1, %al
|
||||
je LBB0_2
|
||||
movd %xmm1, %ecx
|
||||
movd %xmm0, (%rcx,%rdi)
|
||||
LBB0_2:
|
||||
testb $2, %al
|
||||
je LBB0_4
|
||||
pextrd $1, %xmm1, %ecx
|
||||
pextrd $1, %xmm0, (%rcx,%rdi)
|
||||
LBB0_4:
|
||||
testb $4, %al
|
||||
je LBB0_6
|
||||
pextrd $2, %xmm1, %ecx
|
||||
pextrd $2, %xmm0, (%rcx,%rdi)
|
||||
LBB0_6:
|
||||
testb $8, %al
|
||||
je LBB0_8
|
||||
pextrd $3, %xmm1, %eax
|
||||
pextrd $3, %xmm0, (%rax,%rdi)
|
||||
LBB0_8:
|
||||
ret
|
||||
|
||||
If this program is compiled with the
|
||||
``--opt=disable-handle-pseudo-memory-ops`` command-line flag, then the
|
||||
scatter is left as an unresolved function call. The resulting program
|
||||
won't link without unresolved symbols, but the assembly output is much
|
||||
easier to understand:
|
||||
|
||||
::
|
||||
|
||||
"_set___uptr<Ui>ii":
|
||||
movaps %xmm0, %xmm3
|
||||
pmulld LCPI0_0(%rip), %xmm1
|
||||
movdqa %xmm1, %xmm0
|
||||
movaps %xmm3, %xmm1
|
||||
jmp ___pseudo_scatter_base_offsets32_32 ## TAILCALL
|
||||
|
||||
|
||||
Interoperability
|
||||
================
|
||||
|
||||
How can I supply an initial execution mask in the call from the application?
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
Recall that when execution transitions from the application code to an
|
||||
``ispc`` function, all of the program instances are initially executing.
|
||||
In some cases, it may desired that only some of them are running, based on
|
||||
a data-dependent condition computed in the application program. This
|
||||
situation can easily be handled via an additional parameter from the
|
||||
application.
|
||||
|
||||
As a simple example, consider a case where the application code has an
|
||||
array of ``float`` values and we'd like the ``ispc`` code to update
|
||||
just specific values in that array, where which of those values to be
|
||||
updated has been determined by the application. In C++ code, we might
|
||||
have:
|
||||
|
||||
::
|
||||
|
||||
int count = ...;
|
||||
float *array = new float[count];
|
||||
bool *shouldUpdate = new bool[count];
|
||||
// initialize array and shouldUpdate
|
||||
ispc_func(array, shouldUpdate, count);
|
||||
|
||||
Then, the ``ispc`` code could process this update as:
|
||||
|
||||
::
|
||||
|
||||
export void ispc_func(uniform float array[], uniform bool update[],
|
||||
uniform int count) {
|
||||
foreach (i = 0 ... count) {
|
||||
cif (update[i] == true)
|
||||
// update array[i+programIndex]...
|
||||
}
|
||||
}
|
||||
|
||||
(In this case a "coherent" if statement is likely to be worthwhile if the
|
||||
``update`` array will tend to have sections that are either all-true or
|
||||
all-false.)
|
||||
|
||||
How can I generate a single binary executable with support for multiple instruction sets?
|
||||
-----------------------------------------------------------------------------------------
|
||||
|
||||
``ispc`` can also generate output that supports multiple target instruction
|
||||
sets, also generating code that chooses the most appropriate one at runtime
|
||||
if multiple targets are specified with the ``--target`` command-line
|
||||
argument.
|
||||
|
||||
For example, if you run the command:
|
||||
|
||||
::
|
||||
|
||||
ispc foo.ispc -o foo.o --target=sse2,sse4-x2,avx-x2
|
||||
|
||||
Then four object files will be generated: ``foo_sse2.o``, ``foo_sse4.o``,
|
||||
``foo_avx.o``, and ``foo.o``.[#]_ Link all of these into your executable, and
|
||||
when you call a function in ``foo.ispc`` from your application code,
|
||||
``ispc`` will determine which instruction sets are supported by the CPU the
|
||||
code is running on and will call the most appropraite version of the
|
||||
function available.
|
||||
|
||||
.. [#] Similarly, if you choose to generate assembly langauage output or
|
||||
LLVM bitcode output, multiple versions of those files will be created.
|
||||
|
||||
In general, the version of the function that runs will be the one in the
|
||||
most general instruction set that is supported by the system. If you only
|
||||
compile SSE2 and SSE4 variants and run on a system that supports AVX, for
|
||||
example, then the SSE4 variant will be executed. If the system doesn't
|
||||
is not able to run any of the available variants of the function (for
|
||||
example, trying to run a function that only has SSE4 and AVX variants on a
|
||||
system that only supports SSE2), then the standard library ``abort()``
|
||||
function will be called.
|
||||
|
||||
One subtlety is that all non-static global variables (if any) must have the
|
||||
same size and layout with all of the targets used. For example, if you
|
||||
have the global variables:
|
||||
|
||||
::
|
||||
|
||||
uniform int foo[2*programCount];
|
||||
int bar;
|
||||
|
||||
and compile to both SSE2 and AVX targets, both of these variables will have
|
||||
different sizes (the first due to program count having the value 4 for SSE2
|
||||
and 8 for AVX, and the second due to ``varying`` types having different
|
||||
numbers of elements with the two targets--essentially the same issue as the
|
||||
first.) ``ispc`` issues an error in this case.
|
||||
|
||||
|
||||
How can I determine at run-time which vector instruction set's instructions were selected to execute?
|
||||
-----------------------------------------------------------------------------------------------------
|
||||
|
||||
``ispc`` doesn't provide any API that allows querying which vector ISA's
|
||||
instructions are running when multi-target compilation was used. However,
|
||||
this can be solved in "user space" by writing a small helper function.
|
||||
Specifically, if you implement a function like this
|
||||
|
||||
::
|
||||
|
||||
export uniform int isa() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
return 0;
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
return 1;
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
return 2;
|
||||
#else
|
||||
return -1;
|
||||
#endif
|
||||
}
|
||||
|
||||
And then call it from your application code at runtime, it will return 0,
|
||||
1, or 2, depending on which target's instructions are running.
|
||||
|
||||
The way this works is a little surprising, but it's a useful trick. Of
|
||||
course the preprocessor ``#if`` checks are all compile-time only
|
||||
operations. What's actually happening is that the function is compiled
|
||||
multiple times, once for each target, with the appropriate ``ISPC_TARGET``
|
||||
preprocessor symbol set. Then, a small dispatch function is generated for
|
||||
the application to actually call. This dispatch function in turn calls the
|
||||
appropriate version of the function based on the CPU of the system it's
|
||||
executing on, which in turn returns the appropriate value.
|
||||
|
||||
In a similar fashion, it's possible to find out at run-time the value of
|
||||
``programCount`` for the target that's actually being used.
|
||||
|
||||
::
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
Programming Techniques
|
||||
======================
|
||||
|
||||
What primitives are there for communicating between SPMD program instances?
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
The ``broadcast()``, ``rotate()``, and ``shuffle()`` standard library
|
||||
routines provide a variety of mechanisms for the running program instances
|
||||
to communicate values to each other during execution. Note that there's no
|
||||
need to synchronize the program instances before communicating between
|
||||
them, due to the synchronized execution model of gangs of program instances
|
||||
in ``ispc``.
|
||||
|
||||
How can a gang of program instances generate variable amounts of output efficiently?
|
||||
------------------------------------------------------------------------------------
|
||||
|
||||
It's not unusual to have a gang of program instances where each program
|
||||
instance generates a variable amount of output (perhaps some generate no
|
||||
output, some generate one output value, some generate many output values
|
||||
and so forth), and where one would like to have the output densely packed
|
||||
in an output array. The ``exclusive_scan_add()`` function from the
|
||||
standard library is quite useful in this situation.
|
||||
|
||||
Consider the following function:
|
||||
|
||||
::
|
||||
|
||||
uniform int func(uniform float outArray[], ...) {
|
||||
int numOut = ...; // figure out how many to be output
|
||||
float outLocal[MAX_OUT]; // staging area
|
||||
|
||||
// each program instance in the gang puts its results in
|
||||
// outLocal[0], ..., outLocal[numOut-1]
|
||||
|
||||
int startOffset = exclusive_scan_add(numOut);
|
||||
for (int i = 0; i < numOut; ++i)
|
||||
outArray[startOffset + i] = outLocal[i];
|
||||
return reduce_add(numOut);
|
||||
}
|
||||
|
||||
Here, each program instance has computed a number, ``numOut``, of values to
|
||||
output, and has stored them in the ``outLocal`` array. Assume that four
|
||||
program instances are running and that the first one wants to output one
|
||||
value, the second two values, and the third and fourth three values each.
|
||||
In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6)
|
||||
to the four program instances, respectively.
|
||||
|
||||
The first program instance will then write its one result to
|
||||
``outArray[0]``, the second will write its two values to ``outArray[1]``
|
||||
and ``outArray[2]``, and so forth. The ``reduce_add()`` call at the end
|
||||
returns the total number of values that all of the program instances have
|
||||
written to the array.
|
||||
|
||||
FIXME: add discussion of foreach_active as an option here once that's in
|
||||
|
||||
Is it possible to use ispc for explicit vector programming?
|
||||
-----------------------------------------------------------
|
||||
|
||||
The typical model for programming in ``ispc`` is an *implicit* parallel
|
||||
model, where one writes a program that is apparently doing scalar
|
||||
computation on values and the program is then vectorized to run in parallel
|
||||
across the SIMD lanes of a processor. However, ``ispc`` also has some
|
||||
support for explicit vector unit programming, where the vectorization is
|
||||
explicit. Some computations may be more effectively described in the
|
||||
explicit model rather than the implicit model.
|
||||
|
||||
This support is provided via ``uniform`` instances of short vectors
|
||||
Specifically, if this short program
|
||||
|
||||
::
|
||||
|
||||
export uniform float<8> madd(uniform float<8> a, uniform float<8> b,
|
||||
uniform float<8> c) {
|
||||
return a + b * c;
|
||||
}
|
||||
|
||||
is compiled with the AVX target, ``ispc`` generates the following assembly:
|
||||
|
||||
::
|
||||
|
||||
_madd:
|
||||
vmulps %ymm2, %ymm1, %ymm1
|
||||
vaddps %ymm0, %ymm1, %ymm0
|
||||
ret
|
||||
|
||||
(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
|
||||
``addps`` instructions are generated, and so forth.)
|
||||
|
||||
Note that ``ispc`` doesn't currently support control-flow based on
|
||||
``uniform`` short vector types; it is thus not possible to write code like:
|
||||
|
||||
::
|
||||
|
||||
export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
|
||||
uniform int<8> sum = 0;
|
||||
while (a++ < b)
|
||||
++sum;
|
||||
}
|
||||
|
||||
|
||||
How can I debug my ispc programs using Valgrind?
|
||||
------------------------------------------------
|
||||
|
||||
The `valgrind`_ memory checker is an extremely useful memory checker for
|
||||
Linux and OSX; it detects a range of memory errors, including accessing
|
||||
memory after it has been freed, accessing memory beyond the end of an
|
||||
array, accessing uninitialized stack variables, and so forth.
|
||||
In general, applications that use ``ispc`` code run with ``valgrind``
|
||||
without modification and ``valgrind`` will detect the same range of memory
|
||||
errors in ``ispc`` code that it does in C/C++ code.
|
||||
|
||||
.. _valgrind: http://valgrind.org
|
||||
|
||||
One issue to be aware of is that until recently, ``valgrind`` only
|
||||
supported the SSE2 vector instructions; if you are using a version of
|
||||
``valgrind`` older than the 3.7.0 release (5 November 2011), you should
|
||||
compile your ``ispc`` programs with ``--target=sse2`` before running them
|
||||
through ``valgrind``. (Note that if no target is specified, then ``ispc``
|
||||
chooses a target based on the capabilities of the system you're running
|
||||
``ispc`` on.) If you run an ``ispc`` program that uses instructions that
|
||||
``valgrind`` doesn't support, you'll see an error message like:
|
||||
|
||||
::
|
||||
|
||||
vex amd64->IR: unhandled instruction bytes: 0xC5 0xFA 0x10 0x0 0xC5 0xFA 0x11 0x84
|
||||
==46059== valgrind: Unrecognised instruction at address 0x100002707.
|
||||
|
||||
The just-released valgrind 3.7.0 adds support for the SSE4.2 instruction
|
||||
set; if you're using that version (and your system supports SSE4.2), then
|
||||
you can use ``--target=sse4`` when compiling to run with ``valgrind``.
|
||||
|
||||
Note that ``valgrind`` does not yet support programs that use the AVX
|
||||
instruction set.
|
||||
|
||||
3217
docs/ispc.txt
3217
docs/ispc.txt
File diff suppressed because it is too large
Load Diff
@@ -1,85 +0,0 @@
|
||||
===========
|
||||
Performance
|
||||
===========
|
||||
|
||||
The SPMD programming model that ``ispc`` makes it easy to harness the
|
||||
computational power available in SIMD vector units on modern CPUs, while
|
||||
its basis in C makes it easy for programmers to adopt and use
|
||||
productively. This page summarizes the performance of ``ispc`` with the
|
||||
workloads in the ``examples/`` directory of the ``ispc`` distribution.
|
||||
|
||||
These results were measured on a 4-core Apple iMac with a 4-core 3.4GHz
|
||||
Intel® Core-i7 processor using the Intel® AVX instruction set. The basis
|
||||
for comparison is a reference C++ implementation compiled with gcc 4.2.1,
|
||||
the version distributed with OS X 10.7.2. (The reference implementation is
|
||||
also included in the ``examples/`` directory.)
|
||||
|
||||
.. list-table:: Performance of ``ispc`` with a variety of the workloads
|
||||
from the ``examples/`` directory of the ``ispc`` distribution, compared
|
||||
a reference C++ implementation compiled with gcc 4.2.1.
|
||||
|
||||
* - Workload
|
||||
- ``ispc``, 1 core
|
||||
- ``ispc``, 4 cores
|
||||
* - `AOBench`_ (512 x 512 resolution)
|
||||
- 3.99x
|
||||
- 19.32x
|
||||
* - `Binomial Options`_ (128k options)
|
||||
- 7.94x
|
||||
- 33.43x
|
||||
* - `Black-Scholes Options`_ (128k options)
|
||||
- 8.45x
|
||||
- 32.48x
|
||||
* - `Deferred Shading`_ (1280p)
|
||||
- n/a
|
||||
- 23.06x
|
||||
* - `Mandelbrot Set`_
|
||||
- 6.21x
|
||||
- 19.90x
|
||||
* - `Perlin Noise Function`_
|
||||
- 5.37x
|
||||
- n/a
|
||||
* - `Ray Tracer`_ (Sponza dataset)
|
||||
- 3.99x
|
||||
- 19.32x
|
||||
* - `3D Stencil`_
|
||||
- 3.76x
|
||||
- 13.79x
|
||||
* - `Volume Rendering`_
|
||||
- 3.11x
|
||||
- 15.80x
|
||||
|
||||
|
||||
.. _AOBench: https://github.com/ispc/ispc/tree/master/examples/aobench
|
||||
.. _Binomial Options: https://github.com/ispc/ispc/tree/master/examples/options
|
||||
.. _Black-Scholes Options: https://github.com/ispc/ispc/tree/master/examples/options
|
||||
.. _Deferred Shading: https://github.com/ispc/ispc/tree/master/examples/deferred
|
||||
.. _Mandelbrot Set: https://github.com/ispc/ispc/tree/master/examples/mandelbrot_tasks
|
||||
.. _Ray Tracer: https://github.com/ispc/ispc/tree/master/examples/rt
|
||||
.. _Perlin Noise Function: https://github.com/ispc/ispc/tree/master/examples/noise
|
||||
.. _3D Stencil: https://github.com/ispc/ispc/tree/master/examples/stencil
|
||||
.. _Volume Rendering: https://github.com/ispc/ispc/tree/master/examples/volume_rendering
|
||||
|
||||
|
||||
The following table shows speedups for a number of the examples on a
|
||||
2.40GHz, 40-core Intel® Xeon E7-8870 system with the Intel® SSE4
|
||||
instruction set, running Microsoft Windows Server 2008 Enterprise. Here,
|
||||
the serial C/C++ baseline code was compiled with MSVC 2010.
|
||||
|
||||
.. list-table:: Performance of ``ispc`` with a variety of the workloads
|
||||
from the ``examples/`` directory of the ``ispc`` distribution, on
|
||||
system with 40 CPU cores.
|
||||
|
||||
* - Workload
|
||||
- ``ispc``, 40 cores
|
||||
* - AOBench (2048 x 2048 resolution)
|
||||
- 182.36x
|
||||
* - Binomial Options (2m options)
|
||||
- 63.85x
|
||||
* - Black-Scholes Options (2m options)
|
||||
- 83.97x
|
||||
* - Ray Tracer (Sponza dataset)
|
||||
- 195.67x
|
||||
* - Volume Rendering
|
||||
- 243.18x
|
||||
|
||||
@@ -1,714 +0,0 @@
|
||||
==============================================
|
||||
Intel® SPMD Program Compiler Performance Guide
|
||||
==============================================
|
||||
|
||||
The SPMD programming model provided by ``ispc`` naturally delivers
|
||||
excellent performance for many workloads thanks to efficient use of CPU
|
||||
SIMD vector hardware. This guide provides more details about how to get
|
||||
the most out of ``ispc`` in practice.
|
||||
|
||||
* `Key Concepts`_
|
||||
|
||||
+ `Efficient Iteration With "foreach"`_
|
||||
+ `Improving Control Flow Coherence With "foreach_tiled"`_
|
||||
+ `Using Coherent Control Flow Constructs`_
|
||||
+ `Use "uniform" Whenever Appropriate`_
|
||||
|
||||
* `Tips and Techniques`_
|
||||
|
||||
+ `Understanding Gather and Scatter`_
|
||||
+ `Avoid 64-bit Addressing Calculations When Possible`_
|
||||
+ `Avoid Computation With 8 and 16-bit Integer Types`_
|
||||
+ `Implementing Reductions Efficiently`_
|
||||
+ `Using Low-level Vector Tricks`_
|
||||
+ `The "Fast math" Option`_
|
||||
+ `"inline" Aggressively`_
|
||||
+ `Avoid The System Math Library`_
|
||||
+ `Declare Variables In The Scope Where They're Used`_
|
||||
+ `Instrumenting ISPC Programs To Understand Runtime Behavior`_
|
||||
+ `Choosing A Target Vector Width`_
|
||||
|
||||
* `Disclaimer and Legal Information`_
|
||||
|
||||
* `Optimization Notice`_
|
||||
|
||||
Key Concepts
|
||||
============
|
||||
|
||||
This section describes the four most important concepts to understand and
|
||||
keep in mind when writing high-performance ``ispc`` programs. It assumes
|
||||
good familiarity with the topics covered in the ``ispc`` `Users Guide`_.
|
||||
|
||||
.. _Users Guide: ispc.html
|
||||
|
||||
Efficient Iteration With "foreach"
|
||||
----------------------------------
|
||||
|
||||
The ``foreach`` parallel iteration construct is semantically equivalent to
|
||||
a regular ``for()`` loop, though it offers meaningful performance benefits.
|
||||
(See the `documentation on "foreach" in the Users Guide`_ for a review of
|
||||
its syntax and semantics.) As an example, consider this simple function
|
||||
that iterates over some number of elements in an array, doing computation
|
||||
on each one:
|
||||
|
||||
.. _documentation on "foreach" in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled
|
||||
|
||||
::
|
||||
|
||||
export void foo(uniform int a[], uniform int count) {
|
||||
for (int i = programIndex; i < count; i += programCount) {
|
||||
// do some computation on a[i]
|
||||
}
|
||||
}
|
||||
|
||||
Depending on the specifics of the computation being performed, the code
|
||||
generated for this function could likely be improved by modifying the code
|
||||
so that the loop only goes as far through the data as is possible to pack
|
||||
an entire gang of program instances with computation each time thorugh the
|
||||
loop. Doing so enables the ``ispc`` compiler to generate more efficient
|
||||
code for cases where it knows that the execution mask is "all on". Then,
|
||||
an ``if`` statement at the end handles processing the ragged extra bits of
|
||||
data that didn't fully fill a gang.
|
||||
|
||||
::
|
||||
|
||||
export void foo(uniform int a[], uniform int count) {
|
||||
// First, just loop up to the point where all program instances
|
||||
// in the gang will be active at the loop iteration start
|
||||
uniform int countBase = count & ~(programCount-1);
|
||||
for (uniform int i = 0; i < countBase; i += programCount) {
|
||||
int index = i + programIndex;
|
||||
// do some computation on a[index]
|
||||
}
|
||||
// Now handle the ragged extra bits at the end
|
||||
if (countBase < count) {
|
||||
int index = countBase + programIndex;
|
||||
// do some computation on a[index]
|
||||
}
|
||||
}
|
||||
|
||||
While the performance of the above code will likely be better than the
|
||||
first version of the function, the loop body code has been duplicated (or
|
||||
has been forced to move into a separate utility function).
|
||||
|
||||
Using the ``foreach`` looping construct as below provides all of the
|
||||
performance benefits of the second version of this function, with the
|
||||
compactness of the first.
|
||||
|
||||
::
|
||||
|
||||
export void foo(uniform int a[], uniform int count) {
|
||||
foreach (i = 0 ... count) {
|
||||
// do some computation on a[i]
|
||||
}
|
||||
}
|
||||
|
||||
Improving Control Flow Coherence With "foreach_tiled"
|
||||
-----------------------------------------------------
|
||||
|
||||
Depending on the computation being performed, ``foreach_tiled`` may give
|
||||
better performance than ``foreach``. (See the `documentation in the Users
|
||||
Guide`_ for the syntax and semantics of ``foreach_tiled``.) Given a
|
||||
multi-dimensional iteration like:
|
||||
|
||||
.. _documentation in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled
|
||||
|
||||
::
|
||||
|
||||
foreach (i = 0 ... width, j = 0 ... height) {
|
||||
// do computation on element (i,j)
|
||||
}
|
||||
|
||||
if the ``foreach`` statement is used, elements in the gang of program
|
||||
instances will be mapped to values of ``i`` and ``j`` by taking spans of
|
||||
``programCount`` elements across ``i`` with a single value of ``j``. For
|
||||
example, the ``foreach`` statement above roughly corresponds to:
|
||||
|
||||
::
|
||||
|
||||
for (uniform int j = 0; j < height; ++j)
|
||||
for (int i = 0; i < width; i += programCount) {
|
||||
// do computation
|
||||
}
|
||||
|
||||
When a multi-dimensional domain is being iterated over, ``foreach_tiled``
|
||||
statement maps program instances to data in a way that tries to select
|
||||
square n-dimensional segments of the domain. For example, on a compilation
|
||||
target with 8-wide gangs of program instances, it generates code that
|
||||
iterates over the domain the same way as the following code (though more
|
||||
efficiently):
|
||||
|
||||
::
|
||||
|
||||
for (int j = programIndex/4; j < height; j += 2)
|
||||
for (int i = programIndex%4; i < width; i += 4) {
|
||||
// do computation
|
||||
}
|
||||
|
||||
Thus, each gang of program instances operates on a 2x4 tile of the domain.
|
||||
With higher-dimensional iteration and different gang sizes, a similar
|
||||
mapping is performed--e.g. for 2D iteration with a 16-wide gang size, 4x4
|
||||
tiles are iterated over; for 4D iteration with a 8-gang, 1x2x2x2 tiles are
|
||||
processed, and so forth.
|
||||
|
||||
Performance benefit can come from using ``foreach_tiled`` in that it
|
||||
essentially optimizes for the benefit of iterating over *compact* regions
|
||||
of the domian (while ``foreach`` iterates over the domain in a way that
|
||||
generally allows linear memory access.) There are two benefits from
|
||||
processing compact regions of the domain.
|
||||
|
||||
First, it's often the case that the control flow coherence of the program
|
||||
instances in the gang is improved; if data-dependent control flow decisions
|
||||
are related to the values of the data in the domain being processed, and if
|
||||
the data values have some coherence, iterating with compact regions will
|
||||
improve control flow coherence.
|
||||
|
||||
Second, processing compact regions may mean that the data accessed by
|
||||
program instances in the gang is be more coherent, leading to performance
|
||||
benefits from better cache hit rates.
|
||||
|
||||
As a concrete example, for the ray tracer example in the ``ispc``
|
||||
distribution (in the ``examples/rt`` directory), performance is 20% better
|
||||
when the pixels are iterated over using ``foreach_tiled`` than ``foreach``,
|
||||
because more coherent regions of the scene are accessed by the set of rays
|
||||
in the gang of program instances.
|
||||
|
||||
|
||||
Using Coherent Control Flow Constructs
|
||||
--------------------------------------
|
||||
|
||||
Recall from the ``ispc`` Users Guide, in the `SPMD-on-SIMD Execution Model
|
||||
section`_ that ``if`` statements with a ``uniform`` test compile to more
|
||||
efficient code than ``if`` tests with varying tests. The coherent ``cif``
|
||||
statement can provide many benefits of ``if`` with a uniform test in the
|
||||
case where the test is actually varying.
|
||||
|
||||
.. _SPMD-on-SIMD Execution Model section: ispc.html#the-spmd-on-simd-execution-model
|
||||
|
||||
In this case, the code the compiler generates for the ``if``
|
||||
test is along the lines of the following pseudo-code:
|
||||
|
||||
::
|
||||
|
||||
bool expr = /* evaluate cif condition */
|
||||
if (all(expr)) {
|
||||
// run "true" case of if test only
|
||||
} else if (!any(expr)) {
|
||||
// run "false" case of if test only
|
||||
} else {
|
||||
// run both true and false cases, updating mask appropriately
|
||||
}
|
||||
|
||||
For ``if`` statements where the different running SPMD program instances
|
||||
don't have coherent values for the boolean ``if`` test, using ``cif``
|
||||
introduces some additional overhead from the ``all`` and ``any`` tests as
|
||||
well as the corresponding branches. For cases where the program
|
||||
instances often do compute the same boolean value, this overhead is
|
||||
worthwhile. If the control flow is in fact usually incoherent, this
|
||||
overhead only costs performance.
|
||||
|
||||
In a similar fashion, ``ispc`` provides ``cfor``, ``cwhile``, and ``cdo``
|
||||
statements. These statements are semantically the same as the
|
||||
corresponding non-"c"-prefixed functions.
|
||||
|
||||
Use "uniform" Whenever Appropriate
|
||||
----------------------------------
|
||||
|
||||
For any variable that will always have the same value across all of the
|
||||
program instances in a gang, declare the variable with the ``unfiorm``
|
||||
qualifier. Doing so enables the ``ispc`` compiler to emit better code in
|
||||
many different ways.
|
||||
|
||||
As a simple example, consider a ``for`` loop that always does the same
|
||||
number of iterations:
|
||||
|
||||
::
|
||||
|
||||
for (int i = 0; i < 10; ++i)
|
||||
// do something ten times
|
||||
|
||||
If this is written with ``i`` as a ``varying`` variable, as above, there's
|
||||
additional overhead in the code generated for the loop as the compiler
|
||||
emits instructions to handle the possibilty of not all program instances
|
||||
following the same control flow path (as might be the case if the loop
|
||||
limit, 10, was itself a ``varying`` value.)
|
||||
|
||||
If the above loop is instead written with ``i`` ``uniform``, as:
|
||||
|
||||
::
|
||||
|
||||
for (uniform int i = 0; i < 10; ++i)
|
||||
// do something ten times
|
||||
|
||||
Then better code can be generated (and the loop possibly unrolled).
|
||||
|
||||
In some cases, the compiler may be able to detect simple cases like these,
|
||||
but it's always best to provide the compiler with as much help as possible
|
||||
to understand the actual form of your computation.
|
||||
|
||||
|
||||
Tips and Techniques
|
||||
===================
|
||||
|
||||
This section introduces a number of additional techniques that are worth
|
||||
keeping in mind when writing ``ispc`` programs.
|
||||
|
||||
Understanding Gather and Scatter
|
||||
--------------------------------
|
||||
|
||||
Memory reads and writes from the program instances in a gang that access
|
||||
irregular memory locations (rather than a consecutive set of locations, or
|
||||
a single location) can be relatively inefficient. As an example, consider
|
||||
the "simple" array indexing calculation below:
|
||||
|
||||
::
|
||||
|
||||
int i = ....;
|
||||
uniform float x[10] = { ... };
|
||||
float f = x[i];
|
||||
|
||||
Since the index ``i`` is a varying value, the program instances in the gang
|
||||
will in general be reading different locations in the array ``x``. Because
|
||||
current CPUs have a "gather" instruction, the ``ispc`` compiler has to
|
||||
serialize these memory reads, performing a separate memory load for each
|
||||
running program instance, packing the result into ``f``. (The analogous
|
||||
case happens for a write into ``x[i]``.)
|
||||
|
||||
In many cases, gathers like these are unavoidable; the program instances
|
||||
just need to access incoherent memory locations. However, if the array
|
||||
index ``i`` actually has the same value for all of the program instances or
|
||||
if it represents an access to a consecutive set of array locations, much
|
||||
more efficient load and store instructions can be generated instead of
|
||||
gathers and scatters, respectively.
|
||||
|
||||
In many cases, the ``ispc`` compiler is able to deduce that the memory
|
||||
locations accessed by a varying index are either all the same or are
|
||||
uniform. For example, given:
|
||||
|
||||
::
|
||||
|
||||
uniform int x = ...;
|
||||
int y = x;
|
||||
return array[y];
|
||||
|
||||
The compiler is able to determine that all of the program instances are
|
||||
loading from the same location, even though ``y`` is not a ``uniform``
|
||||
variable. In this case, the compiler will transform this load to a regular
|
||||
vector load, rather than a general gather.
|
||||
|
||||
Sometimes the running program instances will access a linear sequence of
|
||||
memory locations; this happens most frequently when array indexing is done
|
||||
based on the built-in ``programIndex`` variable. In many of these cases,
|
||||
the compiler is also able to detect this case and then do a vector load.
|
||||
For example, given:
|
||||
|
||||
::
|
||||
|
||||
for (int i = programIndex; i < count; i += programCount)
|
||||
// process array[i];
|
||||
|
||||
Regular vector loads and stores are issued for accesses to ``array[i]``.
|
||||
|
||||
Both of these cases have been ones where the compiler is able to determine
|
||||
statically that the index has the same value at compile-time. It's
|
||||
often the case that this determination can't be made at compile time, but
|
||||
this is often the case at run time. The ``reduce_equal()`` function from
|
||||
the standard library can be used in this case; it checks to see if the
|
||||
given value is the same across over all of the running program instances,
|
||||
returning true and its ``uniform`` value if so.
|
||||
|
||||
The following function shows the use of ``reduce_equal()`` to check for an
|
||||
equal index at execution time and then either do a scalar load and
|
||||
broadcast or a general gather.
|
||||
|
||||
::
|
||||
|
||||
uniform float array[..] = { ... };
|
||||
float value;
|
||||
int i = ...;
|
||||
uniform int ui;
|
||||
if (reduce_equal(i, &ui) == true)
|
||||
value = array[ui]; // scalar load + broadcast
|
||||
else
|
||||
value = array[i]; // gather
|
||||
|
||||
For a simple case like the one above, the overhead of doing the
|
||||
``reduce_equal()`` check is likely not worthwhile compared to just always
|
||||
doing a gather. In more complex cases, where a number of accesses are done
|
||||
based on the index, it can be worth doing. See the example
|
||||
``examples/volume_rendering`` in the ``ispc`` distribution for the use of
|
||||
this technique in an instance where it is beneficial to performance.
|
||||
|
||||
Avoid 64-bit Addressing Calculations When Possible
|
||||
--------------------------------------------------
|
||||
|
||||
Even when compiling to a 64-bit architecture target, ``ispc`` does many of
|
||||
the addressing calculations in 32-bit precision by default--this behavior
|
||||
can be overridden with the ``--addressing=64`` command-line argument. This
|
||||
option should only be used if it's necessary to be able to address over 4GB
|
||||
of memory in the ``ispc`` code, as it essentially doubles the cost of
|
||||
memory addressing calculations in the generated code.
|
||||
|
||||
Avoid Computation With 8 and 16-bit Integer Types
|
||||
-------------------------------------------------
|
||||
|
||||
The code generated for 8 and 16-bit integer types is generally not as
|
||||
efficient as the code generated for 32-bit integer types. It is generally
|
||||
worthwhile to use 32-bit integer types for intermediate computations, even
|
||||
if the final result will be stored in a smaller integer type.
|
||||
|
||||
Implementing Reductions Efficiently
|
||||
-----------------------------------
|
||||
|
||||
It's often necessary to compute a reduction over a data set--for example,
|
||||
one might want to add all of the values in an array, compute their minimum,
|
||||
etc. ``ispc`` provides a few capabilities that make it easy to efficiently
|
||||
compute reductions like these. However, it's important to use these
|
||||
capabilities appropriately for best results.
|
||||
|
||||
As an example, consider the task of computing the sum of all of the values
|
||||
in an array. In C code, we might have:
|
||||
|
||||
::
|
||||
|
||||
/* C implementation of a sum reduction */
|
||||
float sum(const float array[], int count) {
|
||||
float sum = 0;
|
||||
for (int i = 0; i < count; ++i)
|
||||
sum += array[i];
|
||||
return sum;
|
||||
}
|
||||
|
||||
Exactly this computation could also be expressed as a purely uniform
|
||||
computation in ``ispc``, though without any benefit from vectorization:
|
||||
|
||||
::
|
||||
|
||||
/* inefficient ispc implementation of a sum reduction */
|
||||
uniform float sum(const uniform float array[], uniform int count) {
|
||||
uniform float sum = 0;
|
||||
for (uniform int i = 0; i < count; ++i)
|
||||
sum += array[i];
|
||||
return sum;
|
||||
}
|
||||
|
||||
As a first try, one might try using the ``reduce_add()`` function from the
|
||||
``ispc`` standard library; it takes a ``varying`` value and returns the sum
|
||||
of that value across all of the active program instances.
|
||||
|
||||
::
|
||||
|
||||
/* inefficient ispc implementation of a sum reduction */
|
||||
uniform float sum(const uniform float array[], uniform int count) {
|
||||
uniform float sum = 0;
|
||||
foreach (i = 0 ... count)
|
||||
sum += reduce_add(array[i+programIndex]);
|
||||
return sum;
|
||||
}
|
||||
|
||||
This implementation loads a gang's worth of values from the array, one for
|
||||
each of the program instances, and then uses ``reduce_add()`` to reduce
|
||||
across the program instances and then update the sum. Unfortunately this
|
||||
approach loses most benefit from vectorization, as it does more work on the
|
||||
cross-program instance ``reduce_add()`` call than it saves from the vector
|
||||
load of values.
|
||||
|
||||
The most efficient approach is to do the reduction in two phases: rather
|
||||
than using a ``uniform`` variable to store the sum, we maintain a varying
|
||||
value, such that each program instance is effectively computing a local
|
||||
partial sum on the subset of array values that it has loaded from the
|
||||
array. When the loop over array elements concludes, a single call to
|
||||
``reduce_add()`` computes the final reduction across each of the program
|
||||
instances' elements of ``sum``. This approach effectively compiles to a
|
||||
single vector load and a single vector add for each loop iteration's of
|
||||
values--very efficient code in the end.
|
||||
|
||||
::
|
||||
|
||||
/* good ispc implementation of a sum reduction */
|
||||
uniform float sum(const uniform float array[], uniform int count) {
|
||||
float sum = 0;
|
||||
foreach (i = 0 ... count)
|
||||
sum += array[i+programIndex];
|
||||
return reduce_add(sum);
|
||||
}
|
||||
|
||||
Using Low-level Vector Tricks
|
||||
-----------------------------
|
||||
|
||||
Many low-level Intel® SSE and AVX coding constructs can be implemented in
|
||||
``ispc`` code. The ``ispc`` standard library functions ``intbits()`` and
|
||||
``floatbits()`` are often useful in this context. Recall that
|
||||
``intbits()`` takes a ``float`` value and returns it as an integer where
|
||||
the bits of the integer are the same as the bit representation in memory of
|
||||
the ``float``. (In other words, it does *not* perform an integer to
|
||||
floating-point conversion.) ``floatbits()``, then, performs the inverse
|
||||
computation.
|
||||
|
||||
As an example of the use of these functions, the following code efficiently
|
||||
reverses the sign of the given values.
|
||||
|
||||
::
|
||||
|
||||
float flipsign(float a) {
|
||||
unsigned int i = intbits(a);
|
||||
i ^= 0x80000000;
|
||||
return floatbits(i);
|
||||
}
|
||||
|
||||
This code compiles down to a single XOR instruction.
|
||||
|
||||
The "Fast math" Option
|
||||
----------------------
|
||||
|
||||
``ispc`` has a ``--opt=fast-math`` command-line flag that enables a number of
|
||||
optimizations that may be undesirable in code where numerical precision is
|
||||
critically important. For many graphics applications, for example, the
|
||||
approximations introduced may be acceptable, however. The following two
|
||||
optimizations are performed when ``--opt=fast-math`` is used. By default, the
|
||||
``--opt=fast-math`` flag is off.
|
||||
|
||||
* Expressions like ``x / y``, where ``y`` is a compile-time constant, are
|
||||
transformed to ``x * (1./y)``, where the inverse value of ``y`` is
|
||||
precomputed at compile time.
|
||||
|
||||
* Expressions like ``x / y``, where ``y`` is not a compile-time constant,
|
||||
are transformed to ``x * rcp(y)``, where ``rcp()`` maps to the
|
||||
approximate reciprocal instruction from the ``ispc`` standard library.
|
||||
|
||||
|
||||
"inline" Aggressively
|
||||
---------------------
|
||||
|
||||
Inlining functions aggressively is generally beneficial for performance
|
||||
with ``ispc``. Definitely use the ``inline`` qualifier for any short
|
||||
functions (a few lines long), and experiment with it for longer functions.
|
||||
|
||||
Avoid The System Math Library
|
||||
-----------------------------
|
||||
|
||||
The default math library for transcendentals and the like that ``ispc`` has
|
||||
higher error than the system's math library, though is much more efficient
|
||||
due to being vectorized across the program instances and due to the fact
|
||||
that the functions can be inlined in the final code. (It generally has
|
||||
errors in the range of 10ulps, while the system math library generally has
|
||||
no more than 1ulp of error for transcendentals.)
|
||||
|
||||
If the ``--math-lib=system`` command-line option is used when compiling an
|
||||
``ispc`` program, then calls to the system math library will be generated
|
||||
instead. This option should only be used if the higher precision is
|
||||
absolutely required as the performance impact of using it can be
|
||||
significant.
|
||||
|
||||
Declare Variables In The Scope Where They're Used
|
||||
-------------------------------------------------
|
||||
|
||||
Performance is slightly improved by declaring variables at the same block
|
||||
scope where they are first used. For example, in code like the
|
||||
following, if the lifetime of ``foo`` is only within the scope of the
|
||||
``if`` clause, write the code like this:
|
||||
|
||||
::
|
||||
|
||||
float func() {
|
||||
....
|
||||
if (x < y) {
|
||||
float foo;
|
||||
... use foo ...
|
||||
}
|
||||
}
|
||||
|
||||
Try not to write code as:
|
||||
|
||||
::
|
||||
|
||||
float func() {
|
||||
float foo;
|
||||
....
|
||||
if (x < y) {
|
||||
... use foo ...
|
||||
}
|
||||
}
|
||||
|
||||
Doing so can reduce the amount of masked store instructions that the
|
||||
compiler needs to generate.
|
||||
|
||||
Instrumenting ISPC Programs To Understand Runtime Behavior
|
||||
----------------------------------------------------------
|
||||
|
||||
``ispc`` has an optional instrumentation feature that can help you
|
||||
understand performance issues. If a program is compiled using the
|
||||
``--instrument`` flag, the compiler emits calls to a function with the
|
||||
following signature at various points in the program (for
|
||||
example, at interesting points in the control flow, when scatters or
|
||||
gathers happen.)
|
||||
|
||||
::
|
||||
|
||||
extern "C" {
|
||||
void ISPCInstrument(const char *fn, const char *note,
|
||||
int line, int mask);
|
||||
}
|
||||
|
||||
This function is passed the file name of the ``ispc`` file running, a short
|
||||
note indicating what is happening, the line number in the source file, and
|
||||
the current mask of active program instances in the gang. You must provide an
|
||||
implementation of this function and link it in with your application.
|
||||
|
||||
For example, when the ``ispc`` program runs, this function might be called
|
||||
as follows:
|
||||
|
||||
::
|
||||
|
||||
ISPCInstrument("foo.ispc", "function entry", 55, 0xf);
|
||||
|
||||
This call indicates that at the currently executing program has just
|
||||
entered the function defined at line 55 of the file ``foo.ispc``, with a
|
||||
mask of all lanes currently executing (assuming a four-wide gang size
|
||||
target machine).
|
||||
|
||||
For a fuller example of the utility of this functionality, see
|
||||
``examples/aobench_instrumented`` in the ``ispc`` distribution. Ths
|
||||
example includes an implementation of the ``ISPCInstrument()`` function
|
||||
that collects aggregate data about the program's execution behavior.
|
||||
|
||||
When running this example, you will want to direct to the ``ao`` executable
|
||||
to generate a low resolution image, because the instrumentation adds
|
||||
substantial execution overhead. For example:
|
||||
|
||||
::
|
||||
|
||||
% ./ao 1 32 32
|
||||
|
||||
After the ``ao`` program exits, a summary report along the following lines
|
||||
will be printed. In the first few lines, you can see how many times a few
|
||||
functions were called, and the average percentage of SIMD lanes that were
|
||||
active upon function entry.
|
||||
|
||||
::
|
||||
|
||||
ao.ispc(0067) - function entry: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
|
||||
ao.ispc(0067) - return: uniform control flow: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
|
||||
ao.ispc(0071) - function entry: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
|
||||
ao.ispc(0075) - return: uniform control flow: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
|
||||
ao.ispc(0079) - function entry: 10072 calls (0 / 0.00% all off!), 45.09% active lanes
|
||||
ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes
|
||||
...
|
||||
|
||||
|
||||
Choosing A Target Vector Width
|
||||
------------------------------
|
||||
|
||||
By default, ``ispc`` compiles to the natural vector width of the target
|
||||
instruction set. For example, for SSE2 and SSE4, it compiles four-wide,
|
||||
and for AVX, it complies 8-wide. For some programs, higher performance may
|
||||
be seen if the program is compiled to a doubled vector width--8-wide for
|
||||
SSE and 16-wide for AVX.
|
||||
|
||||
For workloads that don't require many of registers, this method can lead to
|
||||
significantly more efficient execution thanks to greater instruction level
|
||||
parallelism and amortization of various overhead over more program
|
||||
instances. For other workloads, it may lead to a slowdown due to higher
|
||||
register pressure; trying both approaches for key kernels may be
|
||||
worthwhile.
|
||||
|
||||
This option is only available for each of the SSE2, SSE4 and AVX targets.
|
||||
It is selected with the ``--target=sse2-x2``, ``--target=sse4-x2`` and
|
||||
``--target=avx-x2`` options, respectively.
|
||||
|
||||
|
||||
Disclaimer and Legal Information
|
||||
================================
|
||||
|
||||
INFORMATION IN THIS DOCUMENT IS PROVIDED IN CONNECTION WITH INTEL(R) PRODUCTS.
|
||||
NO LICENSE, EXPRESS OR IMPLIED, BY ESTOPPEL OR OTHERWISE, TO ANY INTELLECTUAL
|
||||
PROPERTY RIGHTS IS GRANTED BY THIS DOCUMENT. EXCEPT AS PROVIDED IN INTEL'S TERMS
|
||||
AND CONDITIONS OF SALE FOR SUCH PRODUCTS, INTEL ASSUMES NO LIABILITY WHATSOEVER,
|
||||
AND INTEL DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY, RELATING TO SALE AND/OR USE
|
||||
OF INTEL PRODUCTS INCLUDING LIABILITY OR WARRANTIES RELATING TO FITNESS FOR A
|
||||
PARTICULAR PURPOSE, MERCHANTABILITY, OR INFRINGEMENT OF ANY PATENT, COPYRIGHT
|
||||
OR OTHER INTELLECTUAL PROPERTY RIGHT.
|
||||
|
||||
UNLESS OTHERWISE AGREED IN WRITING BY INTEL, THE INTEL PRODUCTS ARE NOT DESIGNED
|
||||
NOR INTENDED FOR ANY APPLICATION IN WHICH THE FAILURE OF THE INTEL PRODUCT COULD
|
||||
CREATE A SITUATION WHERE PERSONAL INJURY OR DEATH MAY OCCUR.
|
||||
|
||||
Intel may make changes to specifications and product descriptions at any time,
|
||||
without notice. Designers must not rely on the absence or characteristics of any
|
||||
features or instructions marked "reserved" or "undefined." Intel reserves these
|
||||
for future definition and shall have no responsibility whatsoever for conflicts
|
||||
or incompatibilities arising from future changes to them. The information here
|
||||
is subject to change without notice. Do not finalize a design with this
|
||||
information.
|
||||
|
||||
The products described in this document may contain design defects or errors
|
||||
known as errata which may cause the product to deviate from published
|
||||
specifications. Current characterized errata are available on request.
|
||||
|
||||
Contact your local Intel sales office or your distributor to obtain the latest
|
||||
specifications and before placing your product order.
|
||||
|
||||
Copies of documents which have an order number and are referenced in this
|
||||
document, or other Intel literature, may be obtained by calling 1-800-548-4725,
|
||||
or by visiting Intel's Web Site.
|
||||
|
||||
Intel processor numbers are not a measure of performance. Processor numbers
|
||||
differentiate features within each processor family, not across different
|
||||
processor families. See http://www.intel.com/products/processor_number for
|
||||
details.
|
||||
|
||||
BunnyPeople, Celeron, Celeron Inside, Centrino, Centrino Atom,
|
||||
Centrino Atom Inside, Centrino Inside, Centrino logo, Core Inside, FlashFile,
|
||||
i960, InstantIP, Intel, Intel logo, Intel386, Intel486, IntelDX2, IntelDX4,
|
||||
IntelSX2, Intel Atom, Intel Atom Inside, Intel Core, Intel Inside,
|
||||
Intel Inside logo, Intel. Leap ahead., Intel. Leap ahead. logo, Intel NetBurst,
|
||||
Intel NetMerge, Intel NetStructure, Intel SingleDriver, Intel SpeedStep,
|
||||
Intel StrataFlash, Intel Viiv, Intel vPro, Intel XScale, Itanium,
|
||||
Itanium Inside, MCS, MMX, Oplus, OverDrive, PDCharm, Pentium, Pentium Inside,
|
||||
skoool, Sound Mark, The Journey Inside, Viiv Inside, vPro Inside, VTune, Xeon,
|
||||
and Xeon Inside are trademarks of Intel Corporation in the U.S. and other
|
||||
countries.
|
||||
|
||||
* Other names and brands may be claimed as the property of others.
|
||||
|
||||
Copyright(C) 2011, Intel Corporation. All rights reserved.
|
||||
|
||||
|
||||
Optimization Notice
|
||||
===================
|
||||
|
||||
Intel compilers, associated libraries and associated development tools may
|
||||
include or utilize options that optimize for instruction sets that are
|
||||
available in both Intel and non-Intel microprocessors (for example SIMD
|
||||
instruction sets), but do not optimize equally for non-Intel
|
||||
microprocessors. In addition, certain compiler options for Intel
|
||||
compilers, including some that are not specific to Intel
|
||||
micro-architecture, are reserved for Intel microprocessors. For a detailed
|
||||
description of Intel compiler options, including the instruction sets and
|
||||
specific microprocessors they implicate, please refer to the "Intel
|
||||
Compiler User and Reference Guides" under "Compiler Options." Many library
|
||||
routines that are part of Intel compiler products are more highly optimized
|
||||
for Intel microprocessors than for other microprocessors. While the
|
||||
compilers and libraries in Intel compiler products offer optimizations for
|
||||
both Intel and Intel-compatible microprocessors, depending on the options
|
||||
you select, your code and other factors, you likely will get extra
|
||||
performance on Intel microprocessors.
|
||||
|
||||
Intel compilers, associated libraries and associated development tools may
|
||||
or may not optimize to the same degree for non-Intel microprocessors for
|
||||
optimizations that are not unique to Intel microprocessors. These
|
||||
optimizations include Intel® Streaming SIMD Extensions 2 (Intel® SSE2),
|
||||
Intel® Streaming SIMD Extensions 3 (Intel® SSE3), and Supplemental
|
||||
Streaming SIMD Extensions 3 (Intel SSSE3) instruction sets and other
|
||||
optimizations. Intel does not guarantee the availability, functionality,
|
||||
or effectiveness of any optimization on microprocessors not manufactured by
|
||||
Intel. Microprocessor-dependent optimizations in this product are intended
|
||||
for use with Intel microprocessors.
|
||||
|
||||
While Intel believes our compilers and libraries are excellent choices to
|
||||
assist in obtaining the best performance on Intel and non-Intel
|
||||
microprocessors, Intel recommends that you evaluate other compilers and
|
||||
libraries to determine which best meet your requirements. We hope to win
|
||||
your business by striving to offer the best performance of any compiler or
|
||||
library; please let us know if you find we do not.
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
%(head_prefix)s
|
||||
%(head)s
|
||||
<script type="text/javascript">
|
||||
|
||||
var _gaq = _gaq || [];
|
||||
_gaq.push(['_setAccount', 'UA-1486404-4']);
|
||||
_gaq.push(['_trackPageview']);
|
||||
|
||||
(function() {
|
||||
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
||||
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
||||
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
||||
})();
|
||||
|
||||
</script>
|
||||
%(stylesheet)s
|
||||
%(body_prefix)s
|
||||
<div id="wrap">
|
||||
<div id="wrap2">
|
||||
<div id="header">
|
||||
<h1 id="logo">Intel SPMD Program Compiler</h1>
|
||||
<div id="slogan">An open-source compiler for high-performance SIMD programming on
|
||||
the CPU</div>
|
||||
</div>
|
||||
<div id="nav">
|
||||
<div id="nbar">
|
||||
<ul>
|
||||
<li><a href="index.html">Overview</a></li>
|
||||
<li><a href="features.html">Features</a></li>
|
||||
<li><a href="downloads.html">Downloads</a></li>
|
||||
<li><a href="documentation.html">Documentation</a></li>
|
||||
<li id="selected"><a href="perf.html">Performance</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
<div id="content-wrap">
|
||||
<div id="sidebar">
|
||||
<div class="widgetspace">
|
||||
<h1>Resources</h1>
|
||||
<ul class="menu">
|
||||
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-users/">ispc
|
||||
users mailing list</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
|
||||
developers mailing list</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen documentation of
|
||||
<tt>ispc</tt> source code</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
%(body_pre_docinfo)s
|
||||
%(docinfo)s
|
||||
<div id="content">
|
||||
%(body)s
|
||||
</div>
|
||||
<div class="clearfix"></div>
|
||||
<div id="footer"> © 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<!-- Please Do Not remove this link, thank u -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
%(body_suffix)s
|
||||
@@ -1,65 +0,0 @@
|
||||
%(head_prefix)s
|
||||
%(head)s
|
||||
<script type="text/javascript">
|
||||
|
||||
var _gaq = _gaq || [];
|
||||
_gaq.push(['_setAccount', 'UA-1486404-4']);
|
||||
_gaq.push(['_trackPageview']);
|
||||
|
||||
(function() {
|
||||
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
||||
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
||||
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
||||
})();
|
||||
|
||||
</script>
|
||||
%(stylesheet)s
|
||||
%(body_prefix)s
|
||||
<div id="wrap">
|
||||
<div id="wrap2">
|
||||
<div id="header">
|
||||
<h1 id="logo">Intel SPMD Program Compiler</h1>
|
||||
<div id="slogan">An open-source compiler for high-performance SIMD programming on
|
||||
the CPU</div>
|
||||
</div>
|
||||
<div id="nav">
|
||||
<div id="nbar">
|
||||
<ul>
|
||||
<li><a href="index.html">Overview</a></li>
|
||||
<li><a href="features.html">Features</a></li>
|
||||
<li><a href="downloads.html">Downloads</a></li>
|
||||
<li id="selected"><a href="documentation.html">Documentation</a></li>
|
||||
<li><a href="perf.html">Performance</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
<div id="content-wrap">
|
||||
<div id="sidebar">
|
||||
<div class="widgetspace">
|
||||
<h1>Resources</h1>
|
||||
<ul class="menu">
|
||||
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-users/">ispc
|
||||
users mailing list</a></li>
|
||||
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
|
||||
developers mailing list</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen documentation of
|
||||
<tt>ispc</tt> source code</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
%(body_pre_docinfo)s
|
||||
%(docinfo)s
|
||||
<div id="content">
|
||||
%(body)s
|
||||
</div>
|
||||
<div class="clearfix"></div>
|
||||
<div id="footer"> © 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
|
||||
<!-- Please Do Not remove this link, thank u -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
%(body_suffix)s
|
||||
@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
|
||||
# This could be handy for archiving the generated documentation or
|
||||
# if some version control system is used.
|
||||
|
||||
PROJECT_NUMBER = 1.1.0
|
||||
PROJECT_NUMBER = 1.0.10
|
||||
|
||||
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
|
||||
# base path where the generated documentation will be put.
|
||||
@@ -585,6 +585,7 @@ INPUT = builtins.h \
|
||||
ctx.h \
|
||||
decl.h \
|
||||
expr.h \
|
||||
gatherbuf.h \
|
||||
ispc.h \
|
||||
llvmutil.h \
|
||||
module.h \
|
||||
@@ -597,6 +598,7 @@ INPUT = builtins.h \
|
||||
ctx.cpp \
|
||||
decl.cpp \
|
||||
expr.cpp \
|
||||
gatherbuf.cpp \
|
||||
ispc.cpp \
|
||||
llvmutil.cpp \
|
||||
main.cpp \
|
||||
|
||||
@@ -8,11 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4,avx --arch=x86-64
|
||||
|
||||
ISPC_OBJS=objs/ao_ispc.o objs/ao_ispc_sse2.o objs/ao_ispc_sse4.o \
|
||||
objs/ao_ispc_avx.o
|
||||
OBJS=objs/ao.o objs/ao_serial.o $(ISPC_OBJS) $(TASK_OBJ)
|
||||
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
|
||||
|
||||
default: ao
|
||||
|
||||
@@ -24,8 +20,8 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ ao
|
||||
|
||||
ao: dirs $(OBJS) $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
@@ -35,5 +31,5 @@ objs/%.o: ../%.cpp
|
||||
|
||||
objs/ao.o: objs/ao_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -55,6 +55,7 @@
|
||||
using namespace ispc;
|
||||
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
|
||||
#define NSUBSAMPLES 2
|
||||
|
||||
@@ -104,6 +105,38 @@ savePPM(const char *fname, int w, int h)
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc != 4) {
|
||||
@@ -118,6 +151,8 @@ int main(int argc, char **argv)
|
||||
height = atoi (argv[3]);
|
||||
}
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
// Allocate space for output images
|
||||
img = new unsigned char[width * height * 3];
|
||||
fimg = new float[width * height * 3];
|
||||
|
||||
@@ -75,7 +75,7 @@ static inline vec vcross(vec v0, vec v1) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void vnormalize(vec &v) {
|
||||
static inline void vnormalize(reference vec v) {
|
||||
float len2 = dot(v, v);
|
||||
float invlen = rsqrt(len2);
|
||||
v *= invlen;
|
||||
@@ -83,7 +83,8 @@ static inline void vnormalize(vec &v) {
|
||||
|
||||
|
||||
static inline void
|
||||
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
||||
ray_plane_intersect(reference Isect isect, reference Ray ray,
|
||||
reference Plane plane) {
|
||||
float d = -dot(plane.p, plane.n);
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
@@ -103,7 +104,8 @@ ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
||||
|
||||
|
||||
static inline void
|
||||
ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
|
||||
ray_sphere_intersect(reference Isect isect, reference Ray ray,
|
||||
reference Sphere sphere) {
|
||||
vec rs = ray.org - sphere.center;
|
||||
|
||||
float B = dot(rs, ray.dir);
|
||||
@@ -125,7 +127,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
|
||||
|
||||
|
||||
static inline void
|
||||
orthoBasis(vec basis[3], vec n) {
|
||||
orthoBasis(reference vec basis[3], vec n) {
|
||||
basis[2] = n;
|
||||
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
||||
|
||||
@@ -148,8 +150,8 @@ orthoBasis(vec basis[3], vec n) {
|
||||
|
||||
|
||||
static inline float
|
||||
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
||||
RNGState &rngstate) {
|
||||
ambient_occlusion(reference Isect isect, reference Plane plane,
|
||||
reference Sphere spheres[3], reference RNGState rngstate) {
|
||||
float eps = 0.0001f;
|
||||
vec p, n;
|
||||
vec basis[3];
|
||||
@@ -166,8 +168,8 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
||||
Ray ray;
|
||||
Isect occIsect;
|
||||
|
||||
float theta = sqrt(frandom(&rngstate));
|
||||
float phi = 2.0f * M_PI * frandom(&rngstate);
|
||||
float theta = sqrt(frandom(rngstate));
|
||||
float phi = 2.0f * M_PI * frandom(rngstate);
|
||||
float x = cos(phi) * theta;
|
||||
float y = sin(phi) * theta;
|
||||
float z = sqrt(1.0 - theta * theta);
|
||||
@@ -203,7 +205,7 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
||||
*/
|
||||
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
reference uniform float image[]) {
|
||||
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
static Sphere spheres[3] = {
|
||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||
@@ -211,7 +213,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||
RNGState rngstate;
|
||||
|
||||
seed_rng(&rngstate, y0);
|
||||
seed_rng(rngstate, y0);
|
||||
|
||||
// Compute the mapping between the 'programCount'-wide program
|
||||
// instances running in parallel and samples in the image.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -26,18 +26,18 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="ao.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
@@ -86,19 +86,15 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ExecutablePath);$(ProjectDir)..\..</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -107,7 +103,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -123,7 +118,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -141,7 +135,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -160,7 +153,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -173,4 +165,4 @@
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -g3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --instrument --arch=x86-64 --target=sse2
|
||||
ISPCFLAGS=-O2 --instrument --arch=x86-64
|
||||
|
||||
default: ao
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
@@ -50,11 +51,12 @@
|
||||
#include <algorithm>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "ao_instrumented_ispc.h"
|
||||
#include "ao_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
#include "instrument.h"
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
|
||||
#define NSUBSAMPLES 2
|
||||
|
||||
@@ -102,6 +104,37 @@ savePPM(const char *fname, int w, int h)
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
@@ -117,6 +150,8 @@ int main(int argc, char **argv)
|
||||
height = atoi (argv[3]);
|
||||
}
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
// Allocate space for output images
|
||||
img = new unsigned char[width * height * 3];
|
||||
fimg = new float[width * height * 3];
|
||||
|
||||
@@ -75,7 +75,7 @@ static inline vec vcross(vec v0, vec v1) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void vnormalize(vec &v) {
|
||||
static inline void vnormalize(reference vec v) {
|
||||
float len2 = dot(v, v);
|
||||
float invlen = rsqrt(len2);
|
||||
v *= invlen;
|
||||
@@ -83,7 +83,8 @@ static inline void vnormalize(vec &v) {
|
||||
|
||||
|
||||
static inline void
|
||||
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
||||
ray_plane_intersect(reference Isect isect, reference Ray ray,
|
||||
reference Plane plane) {
|
||||
float d = -dot(plane.p, plane.n);
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
@@ -103,7 +104,8 @@ ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
||||
|
||||
|
||||
static inline void
|
||||
ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
|
||||
ray_sphere_intersect(reference Isect isect, reference Ray ray,
|
||||
reference Sphere sphere) {
|
||||
vec rs = ray.org - sphere.center;
|
||||
|
||||
float B = dot(rs, ray.dir);
|
||||
@@ -125,7 +127,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
|
||||
|
||||
|
||||
static inline void
|
||||
orthoBasis(vec basis[3], vec n) {
|
||||
orthoBasis(reference vec basis[3], vec n) {
|
||||
basis[2] = n;
|
||||
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
||||
|
||||
@@ -148,8 +150,8 @@ orthoBasis(vec basis[3], vec n) {
|
||||
|
||||
|
||||
static inline float
|
||||
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
||||
RNGState &rngstate) {
|
||||
ambient_occlusion(reference Isect isect, reference Plane plane,
|
||||
reference Sphere spheres[3], reference RNGState rngstate) {
|
||||
float eps = 0.0001f;
|
||||
vec p, n;
|
||||
vec basis[3];
|
||||
@@ -166,8 +168,8 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
||||
Ray ray;
|
||||
Isect occIsect;
|
||||
|
||||
float theta = sqrt(frandom(&rngstate));
|
||||
float phi = 2.0f * M_PI * frandom(&rngstate);
|
||||
float theta = sqrt(frandom(rngstate));
|
||||
float phi = 2.0f * M_PI * frandom(rngstate);
|
||||
float x = cos(phi) * theta;
|
||||
float y = sin(phi) * theta;
|
||||
float z = sqrt(1.0 - theta * theta);
|
||||
@@ -201,9 +203,8 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
||||
/* Compute the image for the scanlines from [y0,y1), for an overall image
|
||||
of width w and height h.
|
||||
*/
|
||||
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
uniform int nsubsamples, reference uniform float image[]) {
|
||||
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
static Sphere spheres[3] = {
|
||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||
@@ -211,7 +212,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||
RNGState rngstate;
|
||||
|
||||
seed_rng(&rngstate, y0);
|
||||
seed_rng(rngstate, y0);
|
||||
|
||||
// Compute the mapping between the 'programCount'-wide program
|
||||
// instances running in parallel and samples in the image.
|
||||
@@ -230,9 +231,6 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
// direction we do per iteration and ny the number in y.
|
||||
uniform int nx = 1, ny = 1;
|
||||
|
||||
// FIXME: We actually need ny to be 1 regardless of the decomposition,
|
||||
// since the task decomposition is one scanline high.
|
||||
|
||||
if (programCount == 8) {
|
||||
// Do two pixels at once in the x direction
|
||||
nx = 2;
|
||||
@@ -241,21 +239,19 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
++du;
|
||||
}
|
||||
else if (programCount == 16) {
|
||||
nx = 4;
|
||||
ny = 1;
|
||||
if (programIndex >= 4 && programIndex < 8)
|
||||
// Two at once in both x and y
|
||||
nx = ny = 2;
|
||||
if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
|
||||
++du;
|
||||
if (programIndex >= 8 && programIndex < 12)
|
||||
du += 2;
|
||||
if (programIndex >= 12)
|
||||
du += 3;
|
||||
if (programIndex >= 8)
|
||||
++dv;
|
||||
}
|
||||
|
||||
// Now loop over all of the pixels, stepping in x and y as calculated
|
||||
// above. (Assumes that ny divides y and nx divides x...)
|
||||
for (uniform int y = y0; y < y1; y += ny) {
|
||||
for (uniform int x = 0; x < w; x += nx) {
|
||||
// Figure out x,y pixel in NDC
|
||||
// Figur out x,y pixel in NDC
|
||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||
float ret = 0.f;
|
||||
@@ -297,7 +293,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
|
||||
// offset to the first pixel in the image
|
||||
uniform int offset = 3 * (y * w + x);
|
||||
for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
|
||||
for (uniform int p = 0; p < programCount; p += 4, ++offset) {
|
||||
// Get the four sample values for this pixel
|
||||
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
|
||||
retArray[p+3];
|
||||
@@ -319,15 +315,3 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
ao_scanlines(0, h, w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
|
||||
static void task ao_task(uniform int width, uniform int height,
|
||||
uniform int nsubsamples, uniform float image[]) {
|
||||
ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
|
||||
}
|
||||
|
||||
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
launch[h] < ao_task(w, h, nsubsamples, image) >;
|
||||
}
|
||||
|
||||
@@ -21,23 +21,22 @@
|
||||
<ItemGroup>
|
||||
<ClCompile Include="ao.cpp" />
|
||||
<ClCompile Include="instrument.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="ao.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
@@ -86,23 +85,15 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -110,8 +101,7 @@
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -124,8 +114,7 @@
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -140,8 +129,7 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -158,8 +146,7 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -171,4 +158,4 @@
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -31,35 +31,36 @@
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file ast.cpp
|
||||
@brief
|
||||
*/
|
||||
#ifndef ISPC_CPUID_H
|
||||
#define ISPC_CPUID_H 1
|
||||
|
||||
#include "ast.h"
|
||||
#include "func.h"
|
||||
#include "sym.h"
|
||||
#ifdef _MSC_VER
|
||||
// Provides a __cpuid() function with same signature as below
|
||||
#include <intrin.h>
|
||||
#else
|
||||
static void __cpuid(int info[4], int infoType) {
|
||||
__asm__ __volatile__ ("cpuid"
|
||||
: "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
: "0" (infoType));
|
||||
}
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// ASTNode
|
||||
|
||||
ASTNode::~ASTNode() {
|
||||
inline bool CPUSupportsSSE2() {
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
return (info[3] & (1 << 26)) != 0;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// AST
|
||||
|
||||
void
|
||||
AST::AddFunction(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code) {
|
||||
if (sym == NULL)
|
||||
return;
|
||||
functions.push_back(new Function(sym, args, code));
|
||||
inline bool CPUSupportsSSE4() {
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
return (info[2] & (1 << 19)) != 0;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
AST::GenerateIR() {
|
||||
for (unsigned int i = 0; i < functions.size(); ++i)
|
||||
functions[i]->GenerateIR();
|
||||
inline bool CPUSupportsAVX() {
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
return (info[2] & (1 << 28)) != 0;
|
||||
}
|
||||
|
||||
#endif // ISPC_CPUID_H
|
||||
@@ -1,18 +1,22 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_CXX=../tasks_pthreads.cpp
|
||||
TASK_LIB=-lpthread
|
||||
|
||||
ifeq ($(ARCH), Darwin)
|
||||
TASK_CXX=../tasks_gcd.cpp
|
||||
TASK_LIB=
|
||||
endif
|
||||
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64 --math-lib=fast
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64 --math-lib=fast
|
||||
|
||||
OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/kernels_ispc_sse2.o \
|
||||
objs/kernels_ispc_sse4.o objs/kernels_ispc_avx.o \
|
||||
objs/dynamic_c.o objs/dynamic_cilk.o
|
||||
OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/dynamic_c.o objs/dynamic_cilk.o
|
||||
|
||||
default: deferred_shading
|
||||
|
||||
@@ -34,5 +38,5 @@ objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -64,7 +64,7 @@
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static void *
|
||||
lAlignedMalloc(size_t size, int32_t alignment) {
|
||||
lAlignedMalloc(int64_t size, int32_t alignment) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return _aligned_malloc(size, alignment);
|
||||
#endif
|
||||
@@ -118,7 +118,6 @@ Framebuffer::clear() {
|
||||
memset(b, 0, nPixels);
|
||||
}
|
||||
|
||||
|
||||
InputData *
|
||||
CreateInputDataFromFile(const char *path) {
|
||||
FILE *in = fopen(path, "rb");
|
||||
@@ -178,7 +177,8 @@ CreateInputDataFromFile(const char *path) {
|
||||
}
|
||||
|
||||
|
||||
void DeleteInputData(InputData *input) {
|
||||
void DeleteInputData(InputData *input)
|
||||
{
|
||||
lAlignedFree(input->chunk);
|
||||
}
|
||||
|
||||
|
||||
@@ -64,19 +64,15 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -85,7 +81,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -101,7 +96,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -119,7 +113,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -138,7 +131,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -153,23 +145,23 @@
|
||||
<ClCompile Include="dynamic_c.cpp" />
|
||||
<ClCompile Include="dynamic_cilk.cpp" />
|
||||
<ClCompile Include="main.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
<ClCompile Include="../tasks_concrt.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="kernels.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
@@ -60,7 +60,7 @@
|
||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||
|
||||
static void *
|
||||
lAlignedMalloc(size_t size, int32_t alignment) {
|
||||
lAlignedMalloc(int64_t size, int32_t alignment) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return _aligned_malloc(size, alignment);
|
||||
#endif
|
||||
@@ -141,10 +141,12 @@ ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
|
||||
{
|
||||
for (int tileX = 0; tileX < numTilesX; ++tileX) {
|
||||
float minZ, maxZ;
|
||||
ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
|
||||
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
||||
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
|
||||
cameraNear, cameraFar, &minZ, &maxZ);
|
||||
ComputeZBounds(
|
||||
tileX * tileWidth, tileX * tileWidth + tileWidth,
|
||||
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
||||
zBuffer, gBufferWidth,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
&minZ, &maxZ);
|
||||
minZArray[tileX] = minZ;
|
||||
maxZArray[tileX] = maxZ;
|
||||
}
|
||||
@@ -280,8 +282,8 @@ void InitDynamicC(InputData *input) {
|
||||
}
|
||||
|
||||
|
||||
/* We're going to split a tile into 4 sub-tiles. This function
|
||||
reclassifies the tile's lights with respect to the sub-tiles. */
|
||||
// numLights need not be a multiple of programCount here, but the input and output arrays
|
||||
// should be able to handle programCount-sized load/stores.
|
||||
static void
|
||||
SplitTileMinMax(
|
||||
int tileMidX, int tileMidY,
|
||||
@@ -337,7 +339,7 @@ SplitTileMinMax(
|
||||
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
||||
float light_attenuationEndNeg = -light_attenuationEnd;
|
||||
|
||||
// Test lights again against subtile z bounds
|
||||
// Test lights again subtile z bounds
|
||||
bool inFrustum[4];
|
||||
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
|
||||
@@ -412,8 +414,7 @@ Float32ToUnorm8(float f) {
|
||||
}
|
||||
|
||||
|
||||
static inline float
|
||||
half_to_float_fast(uint16_t h) {
|
||||
static inline float half_to_float_fast(uint16_t h) {
|
||||
uint32_t hs = h & (int32_t)0x8000u; // Pick off sign bit
|
||||
uint32_t he = h & (int32_t)0x7C00u; // Pick off exponent bits
|
||||
uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits
|
||||
|
||||
@@ -31,7 +31,7 @@
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef __cilk
|
||||
#ifdef __cilkplusplus
|
||||
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
@@ -60,7 +60,7 @@
|
||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||
|
||||
static void *
|
||||
lAlignedMalloc(size_t size, int32_t alignment) {
|
||||
lAlignedMalloc(int64_t size, int32_t alignment) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return _aligned_malloc(size, alignment);
|
||||
#endif
|
||||
@@ -395,4 +395,4 @@ DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
|
||||
}
|
||||
}
|
||||
|
||||
#endif // __cilk
|
||||
#endif // __cilkplusplus
|
||||
|
||||
@@ -35,22 +35,22 @@
|
||||
|
||||
struct InputDataArrays
|
||||
{
|
||||
uniform float * uniform zBuffer;
|
||||
uniform unsigned int16 * uniform normalEncoded_x; // half float
|
||||
uniform unsigned int16 * uniform normalEncoded_y; // half float
|
||||
uniform unsigned int16 * uniform specularAmount; // half float
|
||||
uniform unsigned int16 * uniform specularPower; // half float
|
||||
uniform unsigned int8 * uniform albedo_x; // unorm8
|
||||
uniform unsigned int8 * uniform albedo_y; // unorm8
|
||||
uniform unsigned int8 * uniform albedo_z; // unorm8
|
||||
uniform float * uniform lightPositionView_x;
|
||||
uniform float * uniform lightPositionView_y;
|
||||
uniform float * uniform lightPositionView_z;
|
||||
uniform float * uniform lightAttenuationBegin;
|
||||
uniform float * uniform lightColor_x;
|
||||
uniform float * uniform lightColor_y;
|
||||
uniform float * uniform lightColor_z;
|
||||
uniform float * uniform lightAttenuationEnd;
|
||||
uniform float zBuffer[];
|
||||
uniform unsigned int16 normalEncoded_x[]; // half float
|
||||
uniform unsigned int16 normalEncoded_y[]; // half float
|
||||
uniform unsigned int16 specularAmount[]; // half float
|
||||
uniform unsigned int16 specularPower[]; // half float
|
||||
uniform unsigned int8 albedo_x[]; // unorm8
|
||||
uniform unsigned int8 albedo_y[]; // unorm8
|
||||
uniform unsigned int8 albedo_z[]; // unorm8
|
||||
uniform float lightPositionView_x[];
|
||||
uniform float lightPositionView_y[];
|
||||
uniform float lightPositionView_z[];
|
||||
uniform float lightAttenuationBegin[];
|
||||
uniform float lightColor_x[];
|
||||
uniform float lightColor_y[];
|
||||
uniform float lightColor_z[];
|
||||
uniform float lightAttenuationEnd[];
|
||||
};
|
||||
|
||||
struct InputHeader
|
||||
@@ -66,6 +66,8 @@ struct InputHeader
|
||||
uniform int32 inputDataArrayOffsets[idaNum];
|
||||
};
|
||||
|
||||
export void foo(reference InputHeader h) { }
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Common utility routines
|
||||
@@ -77,7 +79,8 @@ dot3(float x, float y, float z, float a, float b, float c) {
|
||||
|
||||
|
||||
static inline void
|
||||
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
|
||||
normalize3(float x, float y, float z, reference float ox,
|
||||
reference float oy, reference float oz) {
|
||||
float n = rsqrt(x*x + y*y + z*z);
|
||||
ox = x * n;
|
||||
oy = y * n;
|
||||
@@ -97,6 +100,7 @@ Float32ToUnorm8(float f) {
|
||||
}
|
||||
|
||||
|
||||
// tile width must be a multiple of programCount (SIMD size)
|
||||
static void
|
||||
ComputeZBounds(
|
||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||
@@ -108,17 +112,17 @@ ComputeZBounds(
|
||||
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||
uniform float cameraNear, uniform float cameraFar,
|
||||
// Output
|
||||
uniform float &minZ,
|
||||
uniform float &maxZ
|
||||
reference uniform float minZ,
|
||||
reference uniform float maxZ
|
||||
)
|
||||
{
|
||||
// Find Z bounds
|
||||
float laneMinZ = cameraFar;
|
||||
float laneMaxZ = cameraNear;
|
||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||
foreach (x = tileStartX ... tileEndX) {
|
||||
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
|
||||
// Unproject depth buffer Z value into view space
|
||||
float z = zBuffer[y * gBufferWidth + x];
|
||||
float z = zBuffer[(y * gBufferWidth + x) + programIndex];
|
||||
float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
|
||||
|
||||
// Work out Z bounds for our samples
|
||||
@@ -134,6 +138,8 @@ ComputeZBounds(
|
||||
}
|
||||
|
||||
|
||||
// tile width must be a multiple of programCount (SIMD size)
|
||||
// numLights must currently be a multiple of programCount (SIMD size)
|
||||
export uniform int32
|
||||
IntersectLightsWithTileMinMax(
|
||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||
@@ -152,7 +158,7 @@ IntersectLightsWithTileMinMax(
|
||||
uniform float light_positionView_z_array[],
|
||||
uniform float light_attenuationEnd_array[],
|
||||
// Output
|
||||
uniform int32 tileLightIndices[]
|
||||
reference uniform int32 tileLightIndices[]
|
||||
)
|
||||
{
|
||||
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
@@ -194,7 +200,9 @@ IntersectLightsWithTileMinMax(
|
||||
|
||||
uniform int32 tileNumLights = 0;
|
||||
|
||||
foreach (lightIndex = 0 ... numLights) {
|
||||
for (uniform int32 baseLightIndex = 0; baseLightIndex < numLights;
|
||||
baseLightIndex += programCount) {
|
||||
int32 lightIndex = baseLightIndex + programIndex;
|
||||
float light_positionView_z = light_positionView_z_array[lightIndex];
|
||||
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
||||
float light_attenuationEndNeg = -light_attenuationEnd;
|
||||
@@ -209,31 +217,32 @@ IntersectLightsWithTileMinMax(
|
||||
// don't actually need to mask the rest of this function - this is
|
||||
// just a greedy early-out. Could also structure all of this as
|
||||
// nested if() statements, but this a bit easier to read
|
||||
if (any(inFrustum)) {
|
||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||
if (!any(inFrustum))
|
||||
continue;
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[0] +
|
||||
light_positionView_x * frustumPlanes_xy[0];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[1] +
|
||||
light_positionView_x * frustumPlanes_xy[1];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
d = light_positionView_z * frustumPlanes_z[0] +
|
||||
light_positionView_x * frustumPlanes_xy[0];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[2] +
|
||||
light_positionView_y * frustumPlanes_xy[2];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
d = light_positionView_z * frustumPlanes_z[1] +
|
||||
light_positionView_x * frustumPlanes_xy[1];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[3] +
|
||||
light_positionView_y * frustumPlanes_xy[3];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
d = light_positionView_z * frustumPlanes_z[2] +
|
||||
light_positionView_y * frustumPlanes_xy[2];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[3] +
|
||||
light_positionView_y * frustumPlanes_xy[3];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
// Pack and store intersecting lights
|
||||
cif (inFrustum) {
|
||||
tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
|
||||
lightIndex);
|
||||
}
|
||||
// Pack and store intersecting lights
|
||||
cif (inFrustum) {
|
||||
tileNumLights += packed_store_active(tileLightIndices, tileNumLights,
|
||||
lightIndex);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -241,6 +250,8 @@ IntersectLightsWithTileMinMax(
|
||||
}
|
||||
|
||||
|
||||
// tile width must be a multiple of programCount (SIMD size)
|
||||
// numLights must currently be a multiple of programCount (SIMD size)
|
||||
static uniform int32
|
||||
IntersectLightsWithTile(
|
||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||
@@ -259,7 +270,7 @@ IntersectLightsWithTile(
|
||||
uniform float light_positionView_z_array[],
|
||||
uniform float light_attenuationEnd_array[],
|
||||
// Output
|
||||
uniform int32 tileLightIndices[]
|
||||
reference uniform int32 tileLightIndices[]
|
||||
)
|
||||
{
|
||||
uniform float minZ, maxZ;
|
||||
@@ -278,31 +289,32 @@ IntersectLightsWithTile(
|
||||
}
|
||||
|
||||
|
||||
// tile width must be a multiple of programCount (SIMD size)
|
||||
export void
|
||||
ShadeTile(
|
||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||
uniform int32 tileStartY, uniform int32 tileEndY,
|
||||
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
|
||||
uniform InputDataArrays &inputData,
|
||||
reference uniform InputDataArrays inputData,
|
||||
// Camera data
|
||||
uniform float cameraProj_11, uniform float cameraProj_22,
|
||||
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||
// Light list
|
||||
uniform int32 tileLightIndices[],
|
||||
reference uniform int32 tileLightIndices[],
|
||||
uniform int32 tileNumLights,
|
||||
// UI
|
||||
uniform bool visualizeLightCount,
|
||||
// Output
|
||||
uniform unsigned int8 framebuffer_r[],
|
||||
uniform unsigned int8 framebuffer_g[],
|
||||
uniform unsigned int8 framebuffer_b[]
|
||||
reference uniform unsigned int8 framebuffer_r[],
|
||||
reference uniform unsigned int8 framebuffer_g[],
|
||||
reference uniform unsigned int8 framebuffer_b[]
|
||||
)
|
||||
{
|
||||
if (tileNumLights == 0 || visualizeLightCount) {
|
||||
uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
|
||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||
foreach (x = tileStartX ... tileEndX) {
|
||||
int32 framebufferIndex = (y * gBufferWidth + x);
|
||||
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
|
||||
int32 framebufferIndex = (y * gBufferWidth + x) + programIndex;
|
||||
framebuffer_r[framebufferIndex] = c;
|
||||
framebuffer_g[framebufferIndex] = c;
|
||||
framebuffer_b[framebufferIndex] = c;
|
||||
@@ -315,8 +327,9 @@ ShadeTile(
|
||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||
uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||
|
||||
foreach (x = tileStartX ... tileEndX) {
|
||||
int32 gBufferOffset = y * gBufferWidth + x;
|
||||
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
|
||||
uniform int32 gBufferOffsetBase = y * gBufferWidth + x;
|
||||
int32 gBufferOffset = gBufferOffsetBase + programIndex;
|
||||
|
||||
// Reconstruct position and (negative) view vector from G-buffer
|
||||
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||
@@ -326,7 +339,7 @@ ShadeTile(
|
||||
|
||||
// Compute screen/clip-space position
|
||||
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||
float positionScreen_x = (0.5f + (float)(x)) *
|
||||
float positionScreen_x = (0.5f + (float)(x + programIndex)) *
|
||||
twoOverGBufferWidth - 1.0f;
|
||||
|
||||
// Unproject depth buffer Z value into view space
|
||||
@@ -466,21 +479,24 @@ ShadeTile(
|
||||
// Static decomposition
|
||||
|
||||
task void
|
||||
RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
||||
uniform InputHeader &inputHeader,
|
||||
uniform InputDataArrays &inputData,
|
||||
RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
|
||||
reference uniform InputHeader inputHeader,
|
||||
reference uniform InputDataArrays inputData,
|
||||
uniform int visualizeLightCount,
|
||||
// Output
|
||||
uniform unsigned int8 framebuffer_r[],
|
||||
uniform unsigned int8 framebuffer_g[],
|
||||
uniform unsigned int8 framebuffer_b[]) {
|
||||
uniform int32 group_y = taskIndex / num_groups_x;
|
||||
uniform int32 group_x = taskIndex % num_groups_x;
|
||||
reference uniform unsigned int8 framebuffer_r[],
|
||||
reference uniform unsigned int8 framebuffer_g[],
|
||||
reference uniform unsigned int8 framebuffer_b[]) {
|
||||
uniform int32 group_y = g / num_groups_x;
|
||||
uniform int32 group_x = g % num_groups_x;
|
||||
uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
|
||||
uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
|
||||
uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
|
||||
uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
|
||||
|
||||
uniform int sTileNumLights = 0;
|
||||
uniform int sTileLightIndices[MAX_LIGHTS]; // Light list for the tile
|
||||
|
||||
uniform int framebufferWidth = inputHeader.framebufferWidth;
|
||||
uniform int framebufferHeight = inputHeader.framebufferHeight;
|
||||
uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
|
||||
@@ -488,9 +504,8 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
||||
uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
|
||||
uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
|
||||
|
||||
// Light intersection: figure out which lights illuminate this tile.
|
||||
uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
|
||||
uniform int numTileLights =
|
||||
// Light intersection
|
||||
sTileNumLights =
|
||||
IntersectLightsWithTile(tile_start_x, tile_end_x,
|
||||
tile_start_y, tile_end_y,
|
||||
framebufferWidth, framebufferHeight,
|
||||
@@ -503,43 +518,41 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
||||
inputData.lightPositionView_y,
|
||||
inputData.lightPositionView_z,
|
||||
inputData.lightAttenuationEnd,
|
||||
tileLightIndices);
|
||||
sTileLightIndices);
|
||||
|
||||
// And now shade the tile, using the lights in tileLightIndices
|
||||
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
|
||||
framebufferWidth, framebufferHeight, inputData,
|
||||
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
|
||||
tileLightIndices, numTileLights, visualizeLightCount,
|
||||
sTileLightIndices, sTileNumLights, visualizeLightCount,
|
||||
framebuffer_r, framebuffer_g, framebuffer_b);
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
RenderStatic(uniform InputHeader &inputHeader,
|
||||
uniform InputDataArrays &inputData,
|
||||
RenderStatic(reference uniform InputHeader inputHeader,
|
||||
reference uniform InputDataArrays inputData,
|
||||
uniform int visualizeLightCount,
|
||||
// Output
|
||||
uniform unsigned int8 framebuffer_r[],
|
||||
uniform unsigned int8 framebuffer_g[],
|
||||
uniform unsigned int8 framebuffer_b[]) {
|
||||
reference uniform unsigned int8 framebuffer_r[],
|
||||
reference uniform unsigned int8 framebuffer_g[],
|
||||
reference uniform unsigned int8 framebuffer_b[]) {
|
||||
uniform int num_groups_x = (inputHeader.framebufferWidth +
|
||||
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
|
||||
uniform int num_groups_y = (inputHeader.framebufferHeight +
|
||||
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
|
||||
uniform int num_groups = num_groups_x * num_groups_y;
|
||||
|
||||
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
|
||||
// by MIN_TILE_HEIGHT pixels.
|
||||
launch[num_groups] < RenderTile(num_groups_x, num_groups_y,
|
||||
inputHeader, inputData, visualizeLightCount,
|
||||
framebuffer_r, framebuffer_g, framebuffer_b) >;
|
||||
for (uniform int g = 0; g < num_groups; ++g)
|
||||
launch < RenderTile(g, num_groups_x, num_groups_y,
|
||||
inputHeader, inputData, visualizeLightCount,
|
||||
framebuffer_r, framebuffer_g, framebuffer_b) >;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Routines for dynamic decomposition path
|
||||
|
||||
// This computes the z min/max range for a whole row worth of tiles.
|
||||
// tile width must be a multiple of programCount (SIMD size)
|
||||
export void
|
||||
ComputeZBoundsRow(
|
||||
uniform int32 tileY,
|
||||
@@ -552,8 +565,8 @@ ComputeZBoundsRow(
|
||||
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||
uniform float cameraNear, uniform float cameraFar,
|
||||
// Output
|
||||
uniform float minZArray[],
|
||||
uniform float maxZArray[]
|
||||
reference uniform float minZArray[],
|
||||
reference uniform float maxZArray[]
|
||||
)
|
||||
{
|
||||
for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
|
||||
@@ -570,7 +583,6 @@ ComputeZBoundsRow(
|
||||
}
|
||||
|
||||
|
||||
// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
|
||||
// numLights need not be a multiple of programCount here, but the input and output arrays
|
||||
// should be able to handle programCount-sized load/stores.
|
||||
export void
|
||||
@@ -584,7 +596,7 @@ SplitTileMinMax(
|
||||
// Camera data
|
||||
uniform float cameraProj_11, uniform float cameraProj_22,
|
||||
// Light Data
|
||||
uniform int32 lightIndices[],
|
||||
reference uniform int32 lightIndices[],
|
||||
uniform int32 numLights,
|
||||
uniform float light_positionView_x_array[],
|
||||
uniform float light_positionView_y_array[],
|
||||
@@ -593,9 +605,9 @@ SplitTileMinMax(
|
||||
// Outputs
|
||||
// TODO: ISPC doesn't currently like multidimensionsal arrays so we'll do the
|
||||
// indexing math ourselves
|
||||
uniform int32 subtileIndices[],
|
||||
reference uniform int32 subtileIndices[],
|
||||
uniform int32 subtileIndicesPitch,
|
||||
uniform int32 subtileNumLights[]
|
||||
reference uniform int32 subtileNumLights[]
|
||||
)
|
||||
{
|
||||
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
@@ -633,7 +645,12 @@ SplitTileMinMax(
|
||||
subtileLightOffset[2] = 2 * subtileIndicesPitch;
|
||||
subtileLightOffset[3] = 3 * subtileIndicesPitch;
|
||||
|
||||
foreach (i = 0 ... numLights) {
|
||||
for (int32 i = programIndex; i < numLights; i += programCount) {
|
||||
// TODO: ISPC says gather required here when it actually
|
||||
// isn't... this could be fixed this by nesting an if() within a
|
||||
// uniform loop, but I'm not totally sure if that's a win
|
||||
// overall. For now we'll just eat the perf cost for cleanliness
|
||||
// since the below are real gathers anyways.
|
||||
int32 lightIndex = lightIndices[i];
|
||||
|
||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||
@@ -676,21 +693,21 @@ SplitTileMinMax(
|
||||
// Pack and store intersecting lights
|
||||
// TODO: Experiment with a loop here instead
|
||||
cif (inFrustum[0])
|
||||
subtileLightOffset[0] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[0]],
|
||||
lightIndex);
|
||||
subtileLightOffset[0] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[0],
|
||||
lightIndex);
|
||||
cif (inFrustum[1])
|
||||
subtileLightOffset[1] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[1]],
|
||||
lightIndex);
|
||||
subtileLightOffset[1] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[1],
|
||||
lightIndex);
|
||||
cif (inFrustum[2])
|
||||
subtileLightOffset[2] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[2]],
|
||||
lightIndex);
|
||||
subtileLightOffset[2] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[2],
|
||||
lightIndex);
|
||||
cif (inFrustum[3])
|
||||
subtileLightOffset[3] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[3]],
|
||||
lightIndex);
|
||||
subtileLightOffset[3] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[3],
|
||||
lightIndex);
|
||||
}
|
||||
|
||||
subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
|
||||
|
||||
@@ -63,7 +63,7 @@
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (argc != 2) {
|
||||
printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
|
||||
printf("usage: deferred_shading <input_file>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -77,9 +77,9 @@ int main(int argc, char** argv) {
|
||||
input->header.framebufferHeight);
|
||||
|
||||
InitDynamicC(input);
|
||||
#ifdef __cilk
|
||||
#ifdef __cilkplusplus
|
||||
InitDynamicCilk(input);
|
||||
#endif // __cilk
|
||||
#endif // __cilkplusplus
|
||||
|
||||
int nframes = 5;
|
||||
double ispcCycles = 1e30;
|
||||
@@ -98,21 +98,6 @@ int main(int argc, char** argv) {
|
||||
input->header.framebufferWidth, input->header.framebufferHeight);
|
||||
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
|
||||
|
||||
#ifdef __cilk
|
||||
double dynamicCilkCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
DispatchDynamicCilk(input, &framebuffer);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
|
||||
}
|
||||
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n",
|
||||
dynamicCilkCycles);
|
||||
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
|
||||
#endif // __cilk
|
||||
|
||||
double serialCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
@@ -122,16 +107,29 @@ int main(int argc, char** argv) {
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
serialCycles = std::min(serialCycles, mcycles);
|
||||
}
|
||||
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n",
|
||||
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles\n",
|
||||
serialCycles);
|
||||
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
|
||||
|
||||
#ifdef __cilk
|
||||
#ifdef __cilkplusplus
|
||||
double dynamicCilkCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
DispatchDynamicCilk(input, &framebuffer);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
|
||||
}
|
||||
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles\n",
|
||||
dynamicCilkCycles);
|
||||
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
|
||||
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
|
||||
#else
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
|
||||
#endif // __cilk
|
||||
#endif // __cilkplusplus
|
||||
|
||||
DeleteInputData(input);
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
|
||||
default: mandelbrot
|
||||
|
||||
@@ -14,17 +14,13 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ mandelbrot
|
||||
|
||||
OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc_sse2.o \
|
||||
objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o \
|
||||
objs/mandelbrot_ispc.o
|
||||
|
||||
mandelbrot: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
|
||||
mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o -lm
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "mandelbrot_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
@@ -67,6 +68,38 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
unsigned int width = 768;
|
||||
unsigned int height = 512;
|
||||
@@ -78,6 +111,8 @@ int main() {
|
||||
int maxIterations = 256;
|
||||
int *buf = new int[width*height];
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation; report the minimum
|
||||
// time of three runs.
|
||||
|
||||
@@ -51,7 +51,7 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
uniform float x1, uniform float y1,
|
||||
uniform int width, uniform int height,
|
||||
uniform int maxIterations,
|
||||
uniform int output[])
|
||||
reference uniform int output[])
|
||||
{
|
||||
float dx = (x1 - x0) / width;
|
||||
float dy = (y1 - y0) / height;
|
||||
@@ -60,16 +60,16 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
// Note that we'll be doing programCount computations in parallel,
|
||||
// so increment i by that much. This assumes that width evenly
|
||||
// divides programCount.
|
||||
foreach (i = 0 ... width) {
|
||||
for (uniform int i = 0; i < width; i += programCount) {
|
||||
// Figure out the position on the complex plane to compute the
|
||||
// number of iterations at. Note that the x values are
|
||||
// different across different program instances, since its
|
||||
// initializer incorporates the value of the programIndex
|
||||
// variable.
|
||||
float x = x0 + i * dx;
|
||||
float x = x0 + (programIndex + i) * dx;
|
||||
float y = y0 + j * dy;
|
||||
|
||||
int index = j * width + i;
|
||||
int index = j * width + i + programIndex;
|
||||
output[index] = mandel(x, y, maxIterations);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -64,19 +64,15 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -85,7 +81,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -101,7 +96,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -119,7 +113,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -138,7 +131,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -155,18 +147,18 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="mandelbrot.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
@@ -8,11 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
|
||||
|
||||
OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o $(TASK_OBJ) \
|
||||
objs/mandelbrot_ispc.o objs/mandelbrot_ispc_sse2.o \
|
||||
objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
|
||||
default: mandelbrot
|
||||
|
||||
@@ -24,8 +20,8 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ mandelbrot
|
||||
|
||||
mandelbrot: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
@@ -35,5 +31,5 @@ objs/%.o: ../%.cpp
|
||||
|
||||
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -42,6 +42,7 @@
|
||||
#include <algorithm>
|
||||
#include <string.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "mandelbrot_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
@@ -68,6 +69,37 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
static void usage() {
|
||||
fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
|
||||
exit(1);
|
||||
@@ -100,6 +132,8 @@ int main(int argc, char *argv[]) {
|
||||
else
|
||||
usage();
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
int maxIterations = 512;
|
||||
int *buf = new int[width*height];
|
||||
|
||||
|
||||
@@ -57,16 +57,18 @@ mandelbrot_scanlines(uniform int ybase, uniform int span,
|
||||
uniform float x0, uniform float dx,
|
||||
uniform float y0, uniform float dy,
|
||||
uniform int width, uniform int maxIterations,
|
||||
uniform int output[]) {
|
||||
reference uniform int output[]) {
|
||||
uniform int ystart = ybase + taskIndex * span;
|
||||
uniform int yend = ystart + span;
|
||||
|
||||
foreach (yi = ystart ... yend, xi = 0 ... width) {
|
||||
float x = x0 + xi * dx;
|
||||
float y = y0 + yi * dy;
|
||||
for (uniform int j = ystart; j < yend; ++j) {
|
||||
for (uniform int i = 0; i < width; i += programCount) {
|
||||
float x = x0 + (programIndex + i) * dx;
|
||||
float y = y0 + j * dy;
|
||||
|
||||
int index = yi * width + xi;
|
||||
output[index] = mandel(x, y, maxIterations);
|
||||
int index = j * width + i + programIndex;
|
||||
output[index] = mandel(x, y, maxIterations);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -75,7 +77,7 @@ task void
|
||||
mandelbrot_chunk(uniform float x0, uniform float dx,
|
||||
uniform float y0, uniform float dy,
|
||||
uniform int width, uniform int height,
|
||||
uniform int maxIterations, uniform int output[]) {
|
||||
uniform int maxIterations, reference uniform int output[]) {
|
||||
uniform int ystart = taskIndex * (height/taskCount);
|
||||
uniform int yend = (taskIndex+1) * (height/taskCount);
|
||||
uniform int span = 1;
|
||||
@@ -89,7 +91,7 @@ export void
|
||||
mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
uniform float x1, uniform float y1,
|
||||
uniform int width, uniform int height,
|
||||
uniform int maxIterations, uniform int output[]) {
|
||||
uniform int maxIterations, reference uniform int output[]) {
|
||||
uniform float dx = (x1 - x0) / width;
|
||||
uniform float dy = (y1 - y0) / height;
|
||||
|
||||
|
||||
@@ -64,19 +64,15 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -85,7 +81,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -101,7 +96,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -119,7 +113,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -138,7 +131,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -156,18 +148,18 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="mandelbrot.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
@@ -2,10 +2,7 @@
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4,avx-x2 --arch=x86-64
|
||||
|
||||
OBJS=objs/noise.o objs/noise_serial.o objs/noise_ispc.o objs/noise_ispc_sse2.o \
|
||||
objs/noise_ispc_sse4.o objs/noise_ispc_avx.o
|
||||
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
|
||||
|
||||
default: noise
|
||||
|
||||
@@ -17,13 +14,13 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ noise
|
||||
|
||||
noise: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
|
||||
noise: dirs objs/noise.o objs/noise_serial.o objs/noise_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/noise.o objs/noise_ispc.o objs/noise_serial.o -lm
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/noise.o: objs/noise_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "noise_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
@@ -65,6 +66,38 @@ writePPM(float *buf, int width, int height, const char *fn) {
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
unsigned int width = 768;
|
||||
unsigned int height = 768;
|
||||
@@ -75,6 +108,8 @@ int main() {
|
||||
|
||||
float *buf = new float[width*height];
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation; report the minimum
|
||||
// time of three runs.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -64,19 +64,15 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -85,7 +81,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -101,7 +96,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -119,7 +113,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -138,7 +131,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -155,18 +147,18 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="noise.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
@@ -1,17 +1,8 @@
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -g -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
|
||||
|
||||
OBJS=objs/options.o objs/options_serial.o objs/options_ispc.o \
|
||||
objs/options_ispc_sse2.o objs/options_ispc_sse4.o \
|
||||
objs/options_ispc_avx.o $(TASK_OBJ)
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
|
||||
default: options
|
||||
|
||||
@@ -23,16 +14,13 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ options
|
||||
|
||||
options: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
options: dirs objs/options.o objs/options_serial.o objs/options_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/options.o objs/options_ispc.o objs/options_serial.o -lm
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/options.o: objs/options_ispc.h options_defs.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc options_defs.h
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc options_defs.h
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -31,8 +31,6 @@
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#define NOMINMAX
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
@@ -43,6 +41,7 @@ using std::max;
|
||||
|
||||
#include "options_defs.h"
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
|
||||
#include "options_ispc.h"
|
||||
using namespace ispc;
|
||||
@@ -55,32 +54,49 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
|
||||
float ra[], float va[],
|
||||
float result[], int count);
|
||||
|
||||
static void usage() {
|
||||
printf("usage: options [--count=<num options>]\n");
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int nOptions = 128*1024;
|
||||
int main() {
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
float *S = new float[N_OPTIONS];
|
||||
float *X = new float[N_OPTIONS];
|
||||
float *T = new float[N_OPTIONS];
|
||||
float *r = new float[N_OPTIONS];
|
||||
float *v = new float[N_OPTIONS];
|
||||
float *result = new float[N_OPTIONS];
|
||||
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (strncmp(argv[i], "--count=", 8) == 0) {
|
||||
nOptions = atoi(argv[i] + 8);
|
||||
if (nOptions <= 0) {
|
||||
usage();
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float *S = new float[nOptions];
|
||||
float *X = new float[nOptions];
|
||||
float *T = new float[nOptions];
|
||||
float *r = new float[nOptions];
|
||||
float *v = new float[nOptions];
|
||||
float *result = new float[nOptions];
|
||||
|
||||
for (int i = 0; i < nOptions; ++i) {
|
||||
for (int i = 0; i < N_OPTIONS; ++i) {
|
||||
S[i] = 100; // stock price
|
||||
X[i] = 98; // option strike price
|
||||
T[i] = 2; // time (years)
|
||||
@@ -88,109 +104,61 @@ int main(int argc, char *argv[]) {
|
||||
v[i] = 5; // volatility
|
||||
}
|
||||
|
||||
double sum;
|
||||
|
||||
//
|
||||
// Binomial options pricing model, ispc implementation
|
||||
//
|
||||
double binomial_ispc = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
binomial_put_ispc(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_mcycles();
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum += result[i];
|
||||
binomial_ispc = std::min(binomial_ispc, dt);
|
||||
}
|
||||
printf("[binomial ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n",
|
||||
binomial_ispc, sum / nOptions);
|
||||
|
||||
//
|
||||
// Binomial options pricing model, ispc implementation, tasks
|
||||
//
|
||||
double binomial_tasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
binomial_put_ispc_tasks(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_mcycles();
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum += result[i];
|
||||
binomial_tasks = std::min(binomial_tasks, dt);
|
||||
}
|
||||
printf("[binomial ispc, tasks]:\t\t[%.3f] million cycles (avg %f)\n",
|
||||
binomial_tasks, sum / nOptions);
|
||||
reset_and_start_timer();
|
||||
binomial_put_ispc(S, X, T, r, v, result, N_OPTIONS);
|
||||
double binomial_ispc = get_elapsed_mcycles();
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < N_OPTIONS; ++i)
|
||||
sum += result[i];
|
||||
printf("[binomial ispc]:\t\t[%.3f] million cycles (avg %f)\n",
|
||||
binomial_ispc, sum / N_OPTIONS);
|
||||
|
||||
//
|
||||
// Binomial options, serial implementation
|
||||
//
|
||||
double binomial_serial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
binomial_put_serial(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_mcycles();
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum += result[i];
|
||||
binomial_serial = std::min(binomial_serial, dt);
|
||||
}
|
||||
reset_and_start_timer();
|
||||
binomial_put_serial(S, X, T, r, v, result, N_OPTIONS);
|
||||
double binomial_serial = get_elapsed_mcycles();
|
||||
sum = 0.f;
|
||||
for (int i = 0; i < N_OPTIONS; ++i)
|
||||
sum += result[i];
|
||||
printf("[binomial serial]:\t\t[%.3f] million cycles (avg %f)\n",
|
||||
binomial_serial, sum / nOptions);
|
||||
binomial_serial, sum / N_OPTIONS);
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||
binomial_serial / binomial_ispc, binomial_serial / binomial_tasks);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", binomial_serial / binomial_ispc);
|
||||
|
||||
//
|
||||
// Black-Scholes options pricing model, ispc implementation, 1 thread
|
||||
// Black-Scholes options pricing model, ispc implementation
|
||||
//
|
||||
double bs_ispc = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
black_scholes_ispc(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_mcycles();
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum = 0.f;
|
||||
reset_and_start_timer();
|
||||
for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) {
|
||||
black_scholes_ispc(S, X, T, r, v, result, N_OPTIONS);
|
||||
for (int i = 0; i < N_OPTIONS; ++i)
|
||||
sum += result[i];
|
||||
bs_ispc = std::min(bs_ispc, dt);
|
||||
}
|
||||
printf("[black-scholes ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n",
|
||||
bs_ispc, sum / nOptions);
|
||||
|
||||
//
|
||||
// Black-Scholes options pricing model, ispc implementation, tasks
|
||||
//
|
||||
double bs_ispc_tasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
black_scholes_ispc_tasks(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_mcycles();
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum += result[i];
|
||||
bs_ispc_tasks = std::min(bs_ispc_tasks, dt);
|
||||
}
|
||||
printf("[black-scholes ispc, tasks]:\t[%.3f] million cycles (avg %f)\n",
|
||||
bs_ispc_tasks, sum / nOptions);
|
||||
double bs_ispc = get_elapsed_mcycles();
|
||||
printf("[black-scholes ispc]:\t\t[%.3f] million cycles (avg %f)\n",
|
||||
bs_ispc, sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS));
|
||||
|
||||
//
|
||||
// Black-Scholes options pricing model, serial implementation
|
||||
//
|
||||
double bs_serial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
black_scholes_serial(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_mcycles();
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum = 0.f;
|
||||
reset_and_start_timer();
|
||||
for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) {
|
||||
black_scholes_serial(S, X, T, r, v, result, N_OPTIONS);
|
||||
for (int i = 0; i < N_OPTIONS; ++i)
|
||||
sum += result[i];
|
||||
bs_serial = std::min(bs_serial, dt);
|
||||
}
|
||||
double bs_serial = get_elapsed_mcycles();
|
||||
printf("[black-scholes serial]:\t\t[%.3f] million cycles (avg %f)\n", bs_serial,
|
||||
sum / nOptions);
|
||||
sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS));
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||
bs_serial / bs_ispc, bs_serial / bs_ispc_tasks);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", bs_serial / bs_ispc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -55,100 +55,49 @@ CND(float X) {
|
||||
return w;
|
||||
}
|
||||
|
||||
task void
|
||||
bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
uniform int first = taskIndex * (count/taskCount);
|
||||
uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
|
||||
|
||||
foreach (i = first ... last) {
|
||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
||||
|
||||
float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
|
||||
float d2 = d1 - v * sqrt(T);
|
||||
|
||||
result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
|
||||
}
|
||||
}
|
||||
|
||||
export void
|
||||
black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
uniform int nTasks = max((int)64, (int)count/16384);
|
||||
launch[nTasks] < bs_task(Sa, Xa, Ta, ra, va, result, count) >;
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
foreach (i = 0 ... count) {
|
||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
||||
for (uniform int i = 0; i < count; i += programCount) {
|
||||
float S = Sa[i + programIndex], X = Xa[i + programIndex];
|
||||
float T = Ta[i + programIndex], r = ra[i + programIndex];
|
||||
float v = va[i + programIndex];
|
||||
|
||||
float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
|
||||
float d2 = d1 - v * sqrt(T);
|
||||
|
||||
result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
|
||||
result[i + programIndex] = S * CND(d1) - X * exp(-r * T) * CND(d2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static inline float
|
||||
binomial_put(float S, float X, float T, float r, float v) {
|
||||
float V[BINOMIAL_NUM];
|
||||
|
||||
float dt = T / BINOMIAL_NUM;
|
||||
float u = exp(v * sqrt(dt));
|
||||
float d = 1. / u;
|
||||
float disc = exp(r * dt);
|
||||
float Pu = (disc - d) / (u - d);
|
||||
|
||||
for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
|
||||
float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
|
||||
V[j] = max(0., X - S * upow);
|
||||
}
|
||||
|
||||
for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
|
||||
for (uniform int k = 0; k < j; ++k)
|
||||
V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
|
||||
return V[0];
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
foreach (i = 0 ... count) {
|
||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
||||
result[i] = binomial_put(S, X, T, r, v);
|
||||
float V[BINOMIAL_NUM];
|
||||
|
||||
for (uniform int i = 0; i < count; i += programCount) {
|
||||
float S = Sa[i + programIndex], X = Xa[i + programIndex];
|
||||
float T = Ta[i + programIndex], r = ra[i + programIndex];
|
||||
float v = va[i + programIndex];
|
||||
|
||||
float dt = T / BINOMIAL_NUM;
|
||||
float u = exp(v * sqrt(dt));
|
||||
float d = 1. / u;
|
||||
float disc = exp(r * dt);
|
||||
float Pu = (disc - d) / (u - d);
|
||||
|
||||
for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
|
||||
float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
|
||||
V[j] = max(0., X - S * upow);
|
||||
}
|
||||
|
||||
for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
|
||||
for (uniform int k = 0; k < j; ++k)
|
||||
V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
|
||||
|
||||
result[i + programIndex] = V[0];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
task void
|
||||
binomial_task(uniform float Sa[], uniform float Xa[],
|
||||
uniform float Ta[], uniform float ra[],
|
||||
uniform float va[], uniform float result[],
|
||||
uniform int count) {
|
||||
uniform int first = taskIndex * (count/taskCount);
|
||||
uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
|
||||
|
||||
foreach (i = first ... last) {
|
||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
||||
result[i] = binomial_put(S, X, T, r, v);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
|
||||
uniform float Ta[], uniform float ra[],
|
||||
uniform float va[], uniform float result[],
|
||||
uniform int count) {
|
||||
uniform int nTasks = max((int)64, (int)count/16384);
|
||||
launch[nTasks] < binomial_task(Sa, Xa, Ta, ra, va, result, count) >;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -64,19 +64,15 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -85,7 +81,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
@@ -102,7 +97,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
@@ -121,7 +115,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -141,7 +134,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -155,23 +147,22 @@
|
||||
<ItemGroup>
|
||||
<ClCompile Include="options.cpp" />
|
||||
<ClCompile Include="options_serial.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="options.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
|
||||
@@ -35,6 +35,8 @@
|
||||
#define OPTIONS_DEFS_H 1
|
||||
|
||||
#define BINOMIAL_NUM 64
|
||||
#define N_OPTIONS 65536
|
||||
#define N_BLACK_SCHOLES_ROUNDS 20
|
||||
|
||||
|
||||
#endif // OPTIONS_DEFS_H
|
||||
|
||||
@@ -8,10 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
|
||||
|
||||
OBJS=objs/rt.o objs/rt_serial.o $(TASK_OBJ) objs/rt_ispc.o objs/rt_ispc_sse2.o \
|
||||
objs/rt_ispc_sse4.o objs/rt_ispc_avx.o
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
|
||||
default: rt
|
||||
|
||||
@@ -23,8 +20,8 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ rt
|
||||
|
||||
rt: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
@@ -34,5 +31,5 @@ objs/%.o: ../%.cpp
|
||||
|
||||
objs/rt.o: objs/rt_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -45,6 +45,7 @@
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "rt_ispc.h"
|
||||
|
||||
using namespace ispc;
|
||||
@@ -95,6 +96,38 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void usage() {
|
||||
fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
|
||||
exit(1);
|
||||
@@ -118,6 +151,8 @@ int main(int argc, char *argv[]) {
|
||||
if (filename == NULL)
|
||||
usage();
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
#define READ(var, n) \
|
||||
if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \
|
||||
fprintf(stderr, "Unexpected EOF reading scene file\n"); \
|
||||
@@ -168,12 +203,12 @@ int main(int argc, char *argv[]) {
|
||||
// of node, the total number of int it if a leaf node, etc.
|
||||
float b[6];
|
||||
READ(b[0], 6);
|
||||
nodes[i].bounds[0][0] = b[0];
|
||||
nodes[i].bounds[0][1] = b[1];
|
||||
nodes[i].bounds[0][2] = b[2];
|
||||
nodes[i].bounds[1][0] = b[3];
|
||||
nodes[i].bounds[1][1] = b[4];
|
||||
nodes[i].bounds[1][2] = b[5];
|
||||
nodes[i].bounds[0].v[0] = b[0];
|
||||
nodes[i].bounds[0].v[1] = b[1];
|
||||
nodes[i].bounds[0].v[2] = b[2];
|
||||
nodes[i].bounds[1].v[0] = b[3];
|
||||
nodes[i].bounds[1].v[1] = b[4];
|
||||
nodes[i].bounds[1].v[2] = b[5];
|
||||
READ(nodes[i].offset, 1);
|
||||
READ(nodes[i].nPrimitives, 1);
|
||||
READ(nodes[i].splitAxis, 1);
|
||||
@@ -190,17 +225,19 @@ int main(int argc, char *argv[]) {
|
||||
READ(v[0], 9);
|
||||
float *vp = v;
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
triangles[i].p[j][0] = *vp++;
|
||||
triangles[i].p[j][1] = *vp++;
|
||||
triangles[i].p[j][2] = *vp++;
|
||||
triangles[i].p[j].v[0] = *vp++;
|
||||
triangles[i].p[j].v[1] = *vp++;
|
||||
triangles[i].p[j].v[2] = *vp++;
|
||||
}
|
||||
// And create an object id
|
||||
triangles[i].id = i+1;
|
||||
}
|
||||
fclose(f);
|
||||
|
||||
int height = int(baseHeight * scale);
|
||||
int width = int(baseWidth * scale);
|
||||
// round image resolution up to multiple of 16 to make things easy for
|
||||
// the code that assigns pixels to ispc program instances
|
||||
int height = (int(baseHeight * scale) + 0xf) & ~0xf;
|
||||
int width = (int(baseWidth * scale) + 0xf) & ~0xf;
|
||||
|
||||
// allocate images; one to hold hit object ids, one to hold depth to
|
||||
// the first interseciton
|
||||
|
||||
@@ -43,13 +43,12 @@ struct Ray {
|
||||
};
|
||||
|
||||
struct Triangle {
|
||||
uniform float p[3][4];
|
||||
uniform float3 p[3];
|
||||
uniform int id;
|
||||
uniform int pad[3];
|
||||
};
|
||||
|
||||
struct LinearBVHNode {
|
||||
uniform float bounds[2][3];
|
||||
uniform float3 bounds[2];
|
||||
uniform unsigned int offset; // num primitives for leaf, second child for interior
|
||||
uniform unsigned int8 nPrimitives;
|
||||
uniform unsigned int8 splitAxis;
|
||||
@@ -73,7 +72,7 @@ static inline float Dot(const float3 a, const float3 b) {
|
||||
|
||||
static void generateRay(uniform const float raster2camera[4][4],
|
||||
uniform const float camera2world[4][4],
|
||||
float x, float y, Ray &ray) {
|
||||
float x, float y, reference Ray ray) {
|
||||
ray.mint = 0.f;
|
||||
ray.maxt = 1e30f;
|
||||
|
||||
@@ -104,16 +103,14 @@ static void generateRay(uniform const float raster2camera[4][4],
|
||||
}
|
||||
|
||||
|
||||
static inline bool BBoxIntersect(const uniform float bounds[2][3],
|
||||
const Ray &ray) {
|
||||
uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
|
||||
uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
|
||||
static inline bool BBoxIntersect(const reference uniform float3 bounds[2],
|
||||
const reference Ray ray) {
|
||||
float t0 = ray.mint, t1 = ray.maxt;
|
||||
|
||||
// Check all three axis-aligned slabs. Don't try to early out; it's
|
||||
// not worth the trouble
|
||||
float3 tNear = (bounds0 - ray.origin) * ray.invDir;
|
||||
float3 tFar = (bounds1 - ray.origin) * ray.invDir;
|
||||
float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
|
||||
float3 tFar = (bounds[1] - ray.origin) * ray.invDir;
|
||||
if (tNear.x > tFar.x) {
|
||||
float tmp = tNear.x;
|
||||
tNear.x = tFar.x;
|
||||
@@ -143,12 +140,9 @@ static inline bool BBoxIntersect(const uniform float bounds[2][3],
|
||||
|
||||
|
||||
|
||||
static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
||||
uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
|
||||
uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
|
||||
uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
|
||||
uniform float3 e1 = p1 - p0;
|
||||
uniform float3 e2 = p2 - p0;
|
||||
static inline bool TriIntersect(const reference Triangle tri, reference Ray ray) {
|
||||
uniform float3 e1 = tri.p[1] - tri.p[0];
|
||||
uniform float3 e2 = tri.p[2] - tri.p[0];
|
||||
|
||||
float3 s1 = Cross(ray.dir, e2);
|
||||
float divisor = Dot(s1, e1);
|
||||
@@ -159,7 +153,7 @@ static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
||||
float invDivisor = 1.f / divisor;
|
||||
|
||||
// Compute first barycentric coordinate
|
||||
float3 d = ray.origin - p0;
|
||||
float3 d = ray.origin - tri.p[0];
|
||||
float b1 = Dot(d, s1) * invDivisor;
|
||||
if (b1 < 0. || b1 > 1.)
|
||||
hit = false;
|
||||
@@ -184,7 +178,7 @@ static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
||||
|
||||
|
||||
bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||
Ray &r) {
|
||||
reference Ray r) {
|
||||
Ray ray = r;
|
||||
bool hit = false;
|
||||
// Follow ray through BVH nodes to find primitive intersections
|
||||
@@ -244,15 +238,34 @@ static void raytrace_tile(uniform int x0, uniform int x1,
|
||||
uniform float widthScale = (float)(baseWidth) / (float)(width);
|
||||
uniform float heightScale = (float)(baseHeight) / (float)(height);
|
||||
|
||||
foreach_tiled (y = y0 ... y1, x = x0 ... x1) {
|
||||
Ray ray;
|
||||
generateRay(raster2camera, camera2world, x*widthScale,
|
||||
y*heightScale, ray);
|
||||
BVHIntersect(nodes, triangles, ray);
|
||||
static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
|
||||
0, 1, 0, 1, 2, 3, 2, 3 };
|
||||
static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
|
||||
2, 2, 3, 3, 2, 2, 3, 3 };
|
||||
|
||||
int offset = y * width + x;
|
||||
image[offset] = ray.maxt;
|
||||
id[offset] = ray.hitId;
|
||||
// The outer loops are always over blocks of 4x4 pixels
|
||||
for (uniform int y = y0; y < y1; y += 4) {
|
||||
for (uniform int x = x0; x < x1; x += 4) {
|
||||
// Now we have a block of 4x4=16 pixels to process; it will
|
||||
// take 16/programCount iterations of this loop to process
|
||||
// them.
|
||||
for (uniform int o = 0; o < 16 / programCount; ++o) {
|
||||
// Map program instances to samples in the udx/udy arrays
|
||||
// to figure out which pixel each program instance is
|
||||
// responsible for
|
||||
const float dx = udx[o * programCount + programIndex];
|
||||
const float dy = udy[o * programCount + programIndex];
|
||||
|
||||
Ray ray;
|
||||
generateRay(raster2camera, camera2world, (x+dx)*widthScale,
|
||||
(y+dy)*heightScale, ray);
|
||||
BVHIntersect(nodes, triangles, ray);
|
||||
|
||||
int offset = (y + (int)dy) * width + (x + (int)dx);
|
||||
image[offset] = ray.maxt;
|
||||
id[offset] = ray.hitId;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -270,19 +283,19 @@ export void raytrace_ispc(uniform int width, uniform int height,
|
||||
}
|
||||
|
||||
|
||||
task void raytrace_tile_task(uniform int width, uniform int height,
|
||||
task void raytrace_tile_task(uniform int y0, uniform int y1,
|
||||
uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
uniform int dx = 16, dy = 16; // must match dx, dy below
|
||||
uniform int xBuckets = (width + (dx-1)) / dx;
|
||||
uniform int x0 = (taskIndex % xBuckets) * dx;
|
||||
uniform int x1 = min(x0 + dx, width);
|
||||
uniform int y0 = (taskIndex / xBuckets) * dy;
|
||||
uniform int y1 = min(y0 + dy, height);
|
||||
uniform int dx = 16; // must match dx below
|
||||
uniform int xTasks = (width + (dx-1)) / dx;
|
||||
uniform int x0 = (taskIndex % xTasks) * dx;
|
||||
uniform int x1 = x0 + dx;
|
||||
x1 = min(x1, width);
|
||||
|
||||
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world, image,
|
||||
@@ -298,11 +311,11 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
uniform int dx = 16, dy = 16;
|
||||
uniform int xBuckets = (width + (dx-1)) / dx;
|
||||
uniform int yBuckets = (height + (dy-1)) / dy;
|
||||
uniform int nTasks = xBuckets * yBuckets;
|
||||
launch[nTasks] < raytrace_tile_task(width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world,
|
||||
image, id, nodes, triangles) >;
|
||||
uniform int nTasks = (width + (dx-1)) / dx;
|
||||
for (uniform int y = 0; y < height; y += dy) {
|
||||
uniform int y1 = min(y + dy, height);
|
||||
launch[nTasks] < raytrace_tile_task(y, y1, width, height, baseWidth,
|
||||
baseHeight, raster2camera, camera2world,
|
||||
image, id, nodes, triangles) >;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -64,19 +64,15 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -85,7 +81,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -101,7 +96,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -119,7 +113,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -138,7 +131,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -152,21 +144,21 @@
|
||||
<CustomBuild Include="rt.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
|
||||
@@ -75,13 +75,12 @@ struct Ray {
|
||||
// Declare these in a namespace so the mangling matches
|
||||
namespace ispc {
|
||||
struct Triangle {
|
||||
float p[3][4]; // extra float pad after each vertex
|
||||
float3 p[3];
|
||||
int32_t id;
|
||||
int32_t pad[3]; // make 16 x 32-bits
|
||||
};
|
||||
|
||||
struct LinearBVHNode {
|
||||
float bounds[2][3];
|
||||
float3 bounds[2];
|
||||
int32_t offset; // primitives for leaf, second child for interior
|
||||
uint8_t nPrimitives;
|
||||
uint8_t splitAxis;
|
||||
@@ -141,14 +140,12 @@ static void generateRay(const float raster2camera[4][4],
|
||||
}
|
||||
|
||||
|
||||
static inline bool BBoxIntersect(const float bounds[2][3],
|
||||
static inline bool BBoxIntersect(const float3 bounds[2],
|
||||
const Ray &ray) {
|
||||
float3 bounds0(bounds[0][0], bounds[0][1], bounds[0][2]);
|
||||
float3 bounds1(bounds[1][0], bounds[1][1], bounds[1][2]);
|
||||
float t0 = ray.mint, t1 = ray.maxt;
|
||||
|
||||
float3 tNear = (bounds0 - ray.origin) * ray.invDir;
|
||||
float3 tFar = (bounds1 - ray.origin) * ray.invDir;
|
||||
float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
|
||||
float3 tFar = (bounds[1] - ray.origin) * ray.invDir;
|
||||
if (tNear.x > tFar.x) {
|
||||
float tmp = tNear.x;
|
||||
tNear.x = tFar.x;
|
||||
@@ -179,11 +176,8 @@ static inline bool BBoxIntersect(const float bounds[2][3],
|
||||
|
||||
|
||||
inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
||||
float3 p0(tri.p[0][0], tri.p[0][1], tri.p[0][2]);
|
||||
float3 p1(tri.p[1][0], tri.p[1][1], tri.p[1][2]);
|
||||
float3 p2(tri.p[2][0], tri.p[2][1], tri.p[2][2]);
|
||||
float3 e1 = p1 - p0;
|
||||
float3 e2 = p2 - p0;
|
||||
float3 e1 = tri.p[1] - tri.p[0];
|
||||
float3 e2 = tri.p[2] - tri.p[0];
|
||||
|
||||
float3 s1 = Cross(ray.dir, e2);
|
||||
float divisor = Dot(s1, e1);
|
||||
@@ -193,7 +187,7 @@ inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
||||
float invDivisor = 1.f / divisor;
|
||||
|
||||
// Compute first barycentric coordinate
|
||||
float3 d = ray.origin - p0;
|
||||
float3 d = ray.origin - tri.p[0];
|
||||
float b1 = Dot(d, s1) * invDivisor;
|
||||
if (b1 < 0. || b1 > 1.)
|
||||
return false;
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --arch=x86-64 --target=sse2
|
||||
ISPCFLAGS=-O2 --arch=x86-64
|
||||
|
||||
default: simple
|
||||
|
||||
|
||||
@@ -33,12 +33,47 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "../cpuid.h"
|
||||
|
||||
// Include the header file that the ispc compiler generates
|
||||
#include "simple_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
float vin[16], vout[16];
|
||||
|
||||
// Initialize input buffer
|
||||
|
||||
@@ -34,7 +34,9 @@
|
||||
|
||||
export void simple(uniform float vin[], uniform float vout[],
|
||||
uniform int count) {
|
||||
foreach (index = 0 ... count) {
|
||||
// Compute the result for 'programCount' values in parallel
|
||||
for (uniform int i = 0; i < count; i += programCount) {
|
||||
int index = i + programIndex;
|
||||
// Load the appropriate input value for this program instance.
|
||||
float v = vin[index];
|
||||
|
||||
|
||||
@@ -25,21 +25,21 @@
|
||||
<CustomBuild Include="simple.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
@@ -88,19 +88,15 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -109,7 +105,6 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -123,7 +118,6 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -139,7 +133,6 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -157,7 +150,6 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -169,4 +161,4 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
@@ -8,11 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
|
||||
|
||||
OBJS=objs/stencil.o objs/stencil_serial.o $(TASK_OBJ) objs/stencil_ispc.o \
|
||||
objs/stencil_ispc_sse2.o objs/stencil_ispc_sse4.o \
|
||||
objs/stencil_ispc_avx.o
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
|
||||
default: stencil
|
||||
|
||||
@@ -24,8 +20,8 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ stencil
|
||||
|
||||
stencil: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
stencil: dirs objs/stencil.o objs/stencil_serial.o objs/stencil_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/stencil.o objs/stencil_ispc.o objs/stencil_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
@@ -35,5 +31,5 @@ objs/%.o: ../%.cpp
|
||||
|
||||
objs/stencil.o: objs/stencil_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -42,10 +42,43 @@
|
||||
#include <algorithm>
|
||||
#include <math.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "stencil_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
||||
int y0, int y1, int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
@@ -67,6 +100,8 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
|
||||
|
||||
|
||||
int main() {
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
int Nx = 256, Ny = 256, Nz = 256;
|
||||
int width = 4;
|
||||
float *Aserial[2], *Aispc[2];
|
||||
|
||||
@@ -43,8 +43,9 @@ stencil_step(uniform int x0, uniform int x1,
|
||||
|
||||
for (uniform int z = z0; z < z1; ++z) {
|
||||
for (uniform int y = y0; y < y1; ++y) {
|
||||
foreach (x = x0 ... x1) {
|
||||
int index = (z * Nxy) + (y * Nx) + x;
|
||||
// Assumes that (x1-x0) % programCount == 0
|
||||
for (uniform int x = x0; x < x1; x += programCount) {
|
||||
int index = (z * Nxy) + (y * Nx) + x + programIndex;
|
||||
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
float div = coef[0] * A_cur(0, 0, 0) +
|
||||
|
||||
@@ -64,19 +64,15 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -85,7 +81,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -101,7 +96,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -119,7 +113,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -138,7 +131,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -152,21 +144,21 @@
|
||||
<CustomBuild Include="stencil.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
|
||||
@@ -53,7 +53,9 @@
|
||||
#define ISPC_USE_PTHREADS
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#define ISPC_USE_GCD
|
||||
// pthreads is noticably more efficient than GCD on OSX
|
||||
#define ISPC_USE_PTHREADS
|
||||
//#define ISPC_USE_GCD
|
||||
#endif
|
||||
|
||||
#define DBG(x)
|
||||
@@ -110,7 +112,7 @@ struct TaskInfo {
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// TaskGroupBase
|
||||
|
||||
#define LOG_TASK_QUEUE_CHUNK_SIZE 14
|
||||
#define LOG_TASK_QUEUE_CHUNK_SIZE 12
|
||||
#define MAX_TASK_QUEUE_CHUNKS 8
|
||||
#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
|
||||
|
||||
@@ -157,6 +159,7 @@ private:
|
||||
int memBufferSize[NUM_MEM_BUFFERS];
|
||||
char *memBuffers[NUM_MEM_BUFFERS];
|
||||
char mem[256];
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -8,10 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64
|
||||
|
||||
OBJS=objs/volume.o objs/volume_serial.o $(TASK_OBJ) objs/volume_ispc.o \
|
||||
objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
|
||||
default: volume
|
||||
|
||||
@@ -23,8 +20,8 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ volume
|
||||
|
||||
volume: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
volume: dirs objs/volume.o objs/volume_serial.o objs/volume_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/volume.o objs/volume_ispc.o objs/volume_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
@@ -34,5 +31,5 @@ objs/%.o: ../%.cpp
|
||||
|
||||
objs/volume.o: objs/volume_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "volume_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
@@ -69,6 +70,37 @@ writePPM(float *buf, int width, int height, const char *fn) {
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Load image and viewing parameters from a camera data file.
|
||||
FIXME: we should add support to be able to specify viewing parameters
|
||||
in the program here directly. */
|
||||
@@ -140,6 +172,8 @@ int main(int argc, char *argv[]) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
//
|
||||
// Load viewing data and the volume density data
|
||||
//
|
||||
|
||||
@@ -41,7 +41,7 @@ struct Ray {
|
||||
static void
|
||||
generateRay(const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
float x, float y, Ray &ray) {
|
||||
float x, float y, reference Ray ray) {
|
||||
// transform raster coordinate (x, y, 0) to camera space
|
||||
float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
|
||||
float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
|
||||
@@ -70,7 +70,7 @@ Inside(float3 p, float3 pMin, float3 pMax) {
|
||||
|
||||
|
||||
static bool
|
||||
IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
|
||||
IntersectP(Ray ray, float3 pMin, float3 pMax, reference float hit0, reference float hit1) {
|
||||
float t0 = -1e30, t1 = 1e30;
|
||||
|
||||
float3 tNear = (pMin - ray.origin) / ray.dir;
|
||||
@@ -141,7 +141,7 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
|
||||
|
||||
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
uniform float density[], uniform int nVoxels[3],
|
||||
uniform bool &checkForSameVoxel) {
|
||||
reference uniform bool checkForSameVoxel) {
|
||||
if (!Inside(Pobj, pMin, pMax))
|
||||
return 0;
|
||||
// Compute voxel coordinates and offsets for _Pobj_
|
||||
@@ -155,8 +155,8 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
// Trilinearly interpolate density values to compute local density
|
||||
float d00, d10, d01, d11;
|
||||
uniform int uvx, uvy, uvz;
|
||||
if (checkForSameVoxel && reduce_equal(vx, &uvx) && reduce_equal(vy, &uvy) &&
|
||||
reduce_equal(vz, &uvz)) {
|
||||
if (checkForSameVoxel && reduce_equal(vx, uvx) && reduce_equal(vy, uvy) &&
|
||||
reduce_equal(vz, uvz)) {
|
||||
// If all of the program instances are inside the same voxel, then
|
||||
// we'll call the 'uniform' variant of the voxel density lookup
|
||||
// function, thus doing a single load for each value rather than a
|
||||
@@ -310,7 +310,11 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
||||
// by 4.
|
||||
for (uniform int y = y0; y < y1; y += 4) {
|
||||
for (uniform int x = x0; x < x1; x += 4) {
|
||||
foreach (o = 0 ... 16) {
|
||||
// For each such tile, process programCount pixels at a time,
|
||||
// until we've done all 16 of them. Thus, we're also assuming
|
||||
// that programCount <= 16 and that 16 is evenly dividible by
|
||||
// programCount.
|
||||
for (uniform int o = 0; o < 16; o += programCount) {
|
||||
// These two arrays encode the mapping from [0,15] to
|
||||
// offsets within the 4x4 pixel block so that we render
|
||||
// each pixel inside the block
|
||||
@@ -320,7 +324,8 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
||||
2, 2, 3, 3, 2, 2, 3, 3 };
|
||||
|
||||
// Figure out the pixel to render for this program instance
|
||||
int xo = x + xoffsets[o], yo = y + yoffsets[o];
|
||||
int xo = x + xoffsets[o + programIndex];
|
||||
int yo = y + yoffsets[o + programIndex];
|
||||
|
||||
// Use viewing parameters to compute the corresponding ray
|
||||
// for the pixel
|
||||
@@ -347,7 +352,7 @@ volume_task(uniform float density[], uniform int nVoxels[3],
|
||||
uniform int ybuckets = (height + (dy-1)) / dy;
|
||||
|
||||
uniform int x0 = (taskIndex % xbuckets) * dx;
|
||||
uniform int y0 = (taskIndex / xbuckets) * dy;
|
||||
uniform int y0 = (taskIndex / ybuckets) * dy;
|
||||
uniform int x1 = x0 + dx, y1 = y0 + dy;
|
||||
x1 = min(x1, width);
|
||||
y1 = min(y1, height);
|
||||
|
||||
@@ -64,19 +64,15 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -85,7 +81,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -101,7 +96,6 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -119,7 +113,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -138,7 +131,6 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -156,18 +148,18 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="volume.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
@@ -36,6 +36,9 @@
|
||||
#include <algorithm>
|
||||
|
||||
// Just enough of a float3 class to do what we need in this file.
|
||||
#ifdef _MSC_VER
|
||||
__declspec(align(16))
|
||||
#endif
|
||||
struct float3 {
|
||||
float3() { }
|
||||
float3(float xx, float yy, float zz) { x = xx; y = yy; z = zz; }
|
||||
@@ -295,7 +298,7 @@ volume_serial(float density[], int nVoxels[3], const float raster2camera[4][4],
|
||||
for (int y = 0; y < height; ++y) {
|
||||
for (int x = 0; x < width; ++x, ++offset) {
|
||||
Ray ray;
|
||||
generateRay(raster2camera, camera2world, (float)x, (float)y, ray);
|
||||
generateRay(raster2camera, camera2world, x, y, ray);
|
||||
image[offset] = raymarch(density, nVoxels, ray);
|
||||
}
|
||||
}
|
||||
|
||||
154
expr.h
154
expr.h
@@ -39,9 +39,10 @@
|
||||
#define ISPC_EXPR_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include "ast.h"
|
||||
#include "type.h"
|
||||
|
||||
class FunctionSymbolExpr;
|
||||
|
||||
/** @brief Expr is the abstract base class that defines the interface that
|
||||
all expression types must implement.
|
||||
*/
|
||||
@@ -65,10 +66,6 @@ public:
|
||||
/** Returns the Type of the expression. */
|
||||
virtual const Type *GetType() const = 0;
|
||||
|
||||
/** Returns the type of the value returned by GetLValueType(); this
|
||||
should be a pointer type of some sort (uniform or varying). */
|
||||
virtual const Type *GetLValueType() const;
|
||||
|
||||
/** For expressions that have values based on a symbol (e.g. regular
|
||||
symbol references, array indexing, etc.), this returns a pointer to
|
||||
that symbol. */
|
||||
@@ -93,6 +90,14 @@ public:
|
||||
|
||||
/** Prints the expression to standard output (used for debugging). */
|
||||
virtual void Print() const = 0;
|
||||
|
||||
/** This method tries to convert the expression to the given type. In
|
||||
the event of failure, if the failureOk parameter is true, then no
|
||||
error is issued. If failureOk is false, then an error is printed
|
||||
that incorporates the given error message string. In either
|
||||
failure case, NULL is returned. */
|
||||
Expr *TypeConv(const Type *type, const char *errorMsgBase = NULL,
|
||||
bool failureOk = false, bool issuePrecisionWarnings = true);
|
||||
};
|
||||
|
||||
|
||||
@@ -260,6 +265,10 @@ public:
|
||||
ExprList *args;
|
||||
bool isLaunch;
|
||||
Expr *launchCountExpr;
|
||||
|
||||
private:
|
||||
void resolveFunctionOverloads(bool exactMatchOnly);
|
||||
bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
|
||||
};
|
||||
|
||||
|
||||
@@ -270,12 +279,11 @@ public:
|
||||
*/
|
||||
class IndexExpr : public Expr {
|
||||
public:
|
||||
IndexExpr(Expr *baseExpr, Expr *index, SourcePos p);
|
||||
IndexExpr(Expr *arrayOrVector, Expr *index, SourcePos p);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
const Type *GetLValueType() const;
|
||||
Symbol *GetBaseSymbol() const;
|
||||
void Print() const;
|
||||
|
||||
@@ -283,7 +291,7 @@ public:
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
Expr *baseExpr, *index;
|
||||
Expr *arrayOrVector, *index;
|
||||
};
|
||||
|
||||
|
||||
@@ -293,35 +301,28 @@ public:
|
||||
*/
|
||||
class MemberExpr : public Expr {
|
||||
public:
|
||||
static MemberExpr *create(Expr *expr, const char *identifier,
|
||||
SourcePos pos, SourcePos identifierPos,
|
||||
bool derefLvalue);
|
||||
static MemberExpr* create(Expr *expr, const char *identifier,
|
||||
SourcePos pos, SourcePos identifierPos);
|
||||
|
||||
MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
|
||||
SourcePos identifierPos);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
const Type *GetLValueType() const;
|
||||
Symbol *GetBaseSymbol() const;
|
||||
void Print() const;
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
virtual int getElementNumber() const = 0;
|
||||
virtual const Type *getElementType() const = 0;
|
||||
virtual int getElementNumber() const;
|
||||
|
||||
std::string getCandidateNearMatches() const;
|
||||
|
||||
Expr *expr;
|
||||
std::string identifier;
|
||||
const SourcePos identifierPos;
|
||||
|
||||
protected:
|
||||
MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
|
||||
SourcePos identifierPos, bool derefLValue);
|
||||
|
||||
/** Indicates whether the expression should be dereferenced before the
|
||||
member is found. (i.e. this is true if the MemberExpr was a '->'
|
||||
operator, and is false if it was a '.' operator. */
|
||||
bool dereferenceExpr;
|
||||
};
|
||||
|
||||
|
||||
@@ -493,8 +494,7 @@ private:
|
||||
probably-different type. */
|
||||
class TypeCastExpr : public Expr {
|
||||
public:
|
||||
TypeCastExpr(const Type *t, Expr *e, bool preserveUniformity,
|
||||
SourcePos p);
|
||||
TypeCastExpr(const Type *t, Expr *e, SourcePos p);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
@@ -502,12 +502,9 @@ public:
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
int EstimateCost() const;
|
||||
Symbol *GetBaseSymbol() const;
|
||||
llvm::Constant *GetConstant(const Type *type) const;
|
||||
|
||||
const Type *type;
|
||||
Expr *expr;
|
||||
bool preserveUniformity;
|
||||
};
|
||||
|
||||
|
||||
@@ -519,7 +516,6 @@ public:
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
const Type *GetLValueType() const;
|
||||
Symbol *GetBaseSymbol() const;
|
||||
void Print() const;
|
||||
Expr *TypeCheck();
|
||||
@@ -539,7 +535,6 @@ public:
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
const Type *GetLValueType() const;
|
||||
Symbol *GetBaseSymbol() const;
|
||||
void Print() const;
|
||||
Expr *TypeCheck();
|
||||
@@ -550,44 +545,6 @@ public:
|
||||
};
|
||||
|
||||
|
||||
/** Expression that represents taking the address of an expression. */
|
||||
class AddressOfExpr : public Expr {
|
||||
public:
|
||||
AddressOfExpr(Expr *e, SourcePos p);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
Symbol *GetBaseSymbol() const;
|
||||
void Print() const;
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
int EstimateCost() const;
|
||||
|
||||
Expr *expr;
|
||||
};
|
||||
|
||||
|
||||
/** Expression that returns the size of the given expression or type in
|
||||
bytes. */
|
||||
class SizeOfExpr : public Expr {
|
||||
public:
|
||||
SizeOfExpr(Expr *e, SourcePos p);
|
||||
SizeOfExpr(const Type *t, SourcePos p);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
void Print() const;
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
int EstimateCost() const;
|
||||
|
||||
/* One of expr or type should be non-NULL (but not both of them). The
|
||||
SizeOfExpr returns the size of whichever one of them isn't NULL. */
|
||||
Expr *expr;
|
||||
const Type *type;
|
||||
};
|
||||
|
||||
|
||||
/** @brief Expression representing a symbol reference in the program */
|
||||
class SymbolExpr : public Expr {
|
||||
public:
|
||||
@@ -596,7 +553,6 @@ public:
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
const Type *GetLValueType() const;
|
||||
Symbol *GetBaseSymbol() const;
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
@@ -613,7 +569,7 @@ private:
|
||||
*/
|
||||
class FunctionSymbolExpr : public Expr {
|
||||
public:
|
||||
FunctionSymbolExpr(const char *name, const std::vector<Symbol *> &candFuncs,
|
||||
FunctionSymbolExpr(const char *name, std::vector<Symbol *> *candidateFunctions,
|
||||
SourcePos pos);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
@@ -623,25 +579,9 @@ public:
|
||||
Expr *Optimize();
|
||||
void Print() const;
|
||||
int EstimateCost() const;
|
||||
llvm::Constant *GetConstant(const Type *type) const;
|
||||
|
||||
/** Given the types of the function arguments, in the presence of
|
||||
function overloading, this method resolves which actual function
|
||||
the arguments match best. If the argCouldBeNULL parameter is
|
||||
non-NULL, each element indicates whether the corresponding argument
|
||||
is the number zero, indicating that it could be a NULL pointer.
|
||||
This parameter may be NULL (for cases where overload resolution is
|
||||
being done just given type information without the parameter
|
||||
argument expressions being available. It returns true on success.
|
||||
*/
|
||||
bool ResolveOverloads(const std::vector<const Type *> &argTypes,
|
||||
const std::vector<bool> *argCouldBeNULL = NULL);
|
||||
Symbol *GetMatchingFunction();
|
||||
|
||||
private:
|
||||
bool tryResolve(int (*matchFunc)(const Type *, const Type *),
|
||||
const std::vector<const Type *> &argTypes,
|
||||
const std::vector<bool> *argCouldBeNULL);
|
||||
friend class FunctionCallExpr;
|
||||
|
||||
/** Name of the function that is being called. */
|
||||
std::string name;
|
||||
@@ -649,12 +589,11 @@ private:
|
||||
/** All of the functions with the name given in the function call;
|
||||
there may be more then one, in which case we need to resolve which
|
||||
overload is the best match. */
|
||||
std::vector<Symbol *> candidateFunctions;
|
||||
std::vector<Symbol *> *candidateFunctions;
|
||||
|
||||
/** The actual matching function found after overload resolution. */
|
||||
/** The actual matching function found after overload resolution; this
|
||||
value is set by FunctionCallExpr::resolveFunctionOverloads() */
|
||||
Symbol *matchingFunc;
|
||||
|
||||
bool triedToResolve;
|
||||
};
|
||||
|
||||
|
||||
@@ -672,37 +611,4 @@ public:
|
||||
int EstimateCost() const;
|
||||
};
|
||||
|
||||
|
||||
/** @brief An expression that represents a NULL pointer. */
|
||||
class NullPointerExpr : public Expr {
|
||||
public:
|
||||
NullPointerExpr(SourcePos p) : Expr(p) { }
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
void Print() const;
|
||||
int EstimateCost() const;
|
||||
};
|
||||
|
||||
|
||||
/** This function indicates whether it's legal to convert from fromType to
|
||||
toType. If the optional errorMsgBase and source position parameters
|
||||
are provided, then an error message is issued if the type conversion
|
||||
isn't possible.
|
||||
*/
|
||||
bool CanConvertTypes(const Type *fromType, const Type *toType,
|
||||
const char *errorMsgBase = NULL,
|
||||
SourcePos pos = SourcePos());
|
||||
|
||||
/** This function attempts to convert the given expression to the given
|
||||
type, returning a pointer to a new expression that is the result. If
|
||||
the required type conversion is illegal, it returns NULL and prints an
|
||||
error message using the provided string to indicate the context for
|
||||
which type conversion was being applied (e.g. "function call
|
||||
parameter").
|
||||
*/
|
||||
Expr *TypeConvertExpr(Expr *expr, const Type *toType, const char *errorMsgBase);
|
||||
|
||||
#endif // ISPC_EXPR_H
|
||||
|
||||
@@ -14,10 +14,10 @@ export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
|
||||
varying int3 vv = array[a];
|
||||
++vv.y;
|
||||
array[a] = vv;
|
||||
//CO print("fin %\n", array[programIndex].y);
|
||||
ret[programIndex] = array[programIndex].y;
|
||||
}
|
||||
|
||||
export void result(uniform float ret[]) {
|
||||
ret[programIndex] = 101+programIndex;
|
||||
ret[0] = 100;
|
||||
ret[programIndex] = 100+programIndex;
|
||||
}
|
||||
@@ -8,6 +8,9 @@ struct Foo {
|
||||
float y;
|
||||
};
|
||||
|
||||
extern void aa(reference Foo f);
|
||||
extern void bb(reference Foo f[]);
|
||||
|
||||
typedef float<3> float3;
|
||||
|
||||
void set(uniform float3 f[], int offset, float3 val) {
|
||||
414
func.cpp
414
func.cpp
@@ -1,414 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file func.cpp
|
||||
@brief
|
||||
*/
|
||||
|
||||
#include "func.h"
|
||||
#include "ctx.h"
|
||||
#include "expr.h"
|
||||
#include "llvmutil.h"
|
||||
#include "module.h"
|
||||
#include "type.h"
|
||||
#include "stmt.h"
|
||||
#include "sym.h"
|
||||
#include "util.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#include <llvm/Type.h>
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Intrinsics.h>
|
||||
#include <llvm/PassManager.h>
|
||||
#include <llvm/PassRegistry.h>
|
||||
#include <llvm/Transforms/IPO.h>
|
||||
#include <llvm/Support/FormattedStream.h>
|
||||
#include <llvm/Support/FileUtilities.h>
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/Target/TargetOptions.h>
|
||||
#include <llvm/Target/TargetData.h>
|
||||
#include <llvm/PassManager.h>
|
||||
#include <llvm/Analysis/Verifier.h>
|
||||
#include <llvm/Support/CFG.h>
|
||||
#include <llvm/Support/ToolOutputFile.h>
|
||||
#include <llvm/Assembly/PrintModulePass.h>
|
||||
|
||||
Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
|
||||
sym = s;
|
||||
args = a;
|
||||
code = c;
|
||||
|
||||
maskSymbol = m->symbolTable->LookupVariable("__mask");
|
||||
assert(maskSymbol != NULL);
|
||||
|
||||
if (code != NULL) {
|
||||
if (g->debugPrint) {
|
||||
fprintf(stderr, "Creating function \"%s\". Initial code:\n",
|
||||
sym->name.c_str());
|
||||
code->Print(0);
|
||||
fprintf(stderr, "---------------------\n");
|
||||
}
|
||||
|
||||
code = code->TypeCheck();
|
||||
|
||||
if (code != NULL && g->debugPrint) {
|
||||
fprintf(stderr, "After typechecking function \"%s\":\n",
|
||||
sym->name.c_str());
|
||||
code->Print(0);
|
||||
fprintf(stderr, "---------------------\n");
|
||||
}
|
||||
|
||||
if (code != NULL) {
|
||||
code = code->Optimize();
|
||||
if (g->debugPrint) {
|
||||
fprintf(stderr, "After optimizing function \"%s\":\n",
|
||||
sym->name.c_str());
|
||||
code->Print(0);
|
||||
fprintf(stderr, "---------------------\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (g->debugPrint) {
|
||||
printf("Add Function %s\n", sym->name.c_str());
|
||||
code->Print(0);
|
||||
printf("\n\n\n");
|
||||
}
|
||||
|
||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
||||
assert(type != NULL);
|
||||
|
||||
for (unsigned int i = 0; i < args.size(); ++i)
|
||||
if (dynamic_cast<const ReferenceType *>(args[i]->type) == NULL)
|
||||
args[i]->parentFunction = this;
|
||||
|
||||
if (type->isTask) {
|
||||
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
|
||||
assert(threadIndexSym);
|
||||
threadCountSym = m->symbolTable->LookupVariable("threadCount");
|
||||
assert(threadCountSym);
|
||||
taskIndexSym = m->symbolTable->LookupVariable("taskIndex");
|
||||
assert(taskIndexSym);
|
||||
taskCountSym = m->symbolTable->LookupVariable("taskCount");
|
||||
assert(taskCountSym);
|
||||
}
|
||||
else
|
||||
threadIndexSym = threadCountSym = taskIndexSym = taskCountSym = NULL;
|
||||
}
|
||||
|
||||
|
||||
const Type *
|
||||
Function::GetReturnType() const {
|
||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
||||
assert(type != NULL);
|
||||
return type->GetReturnType();
|
||||
}
|
||||
|
||||
|
||||
const FunctionType *
|
||||
Function::GetType() const {
|
||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
||||
assert(type != NULL);
|
||||
return type;
|
||||
}
|
||||
|
||||
|
||||
/** Parameters for tasks are stored in a big structure; this utility
|
||||
function emits code to copy those values out of the task structure into
|
||||
local stack-allocated variables. (Which we expect that LLVM's
|
||||
'mem2reg' pass will in turn promote to SSA registers..
|
||||
*/
|
||||
static void
|
||||
lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol *> &args,
|
||||
FunctionEmitContext *ctx) {
|
||||
// We expect the argument structure to come in as a poitner to a
|
||||
// structure. Confirm and figure out its type here.
|
||||
const llvm::Type *structArgType = structArgPtr->getType();
|
||||
assert(llvm::isa<llvm::PointerType>(structArgType));
|
||||
const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(structArgType);
|
||||
assert(llvm::isa<llvm::StructType>(pt->getElementType()));
|
||||
const llvm::StructType *argStructType =
|
||||
llvm::dyn_cast<const llvm::StructType>(pt->getElementType());
|
||||
|
||||
// Get the type of the argument we're copying in and its Symbol pointer
|
||||
LLVM_TYPE_CONST llvm::Type *argType = argStructType->getElementType(i);
|
||||
Symbol *sym = args[i];
|
||||
|
||||
// allocate space to copy the parameter in to
|
||||
sym->storagePtr = ctx->AllocaInst(argType, sym->name.c_str());
|
||||
|
||||
// get a pointer to the value in the struct
|
||||
llvm::Value *ptr = ctx->AddElementOffset(structArgPtr, i, NULL, sym->name.c_str());
|
||||
|
||||
// and copy the value from the struct and into the local alloca'ed
|
||||
// memory
|
||||
llvm::Value *ptrval = ctx->LoadInst(ptr, sym->name.c_str());
|
||||
ctx->StoreInst(ptrval, sym->storagePtr);
|
||||
ctx->EmitFunctionParameterDebugInfo(sym);
|
||||
}
|
||||
|
||||
|
||||
/** Given the statements implementing a function, emit the code that
|
||||
implements the function. Most of the work do be done here just
|
||||
involves wiring up the function parameter values to be available in the
|
||||
function body code.
|
||||
*/
|
||||
void
|
||||
Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
SourcePos firstStmtPos) {
|
||||
llvm::Value *maskPtr = ctx->AllocaInst(LLVMTypes::MaskType, "mask_memory");
|
||||
ctx->StoreInst(LLVMMaskAllOn, maskPtr);
|
||||
maskSymbol->storagePtr = maskPtr;
|
||||
ctx->SetMaskPointer(maskPtr);
|
||||
|
||||
// add debugging info for __mask, programIndex, ...
|
||||
maskSymbol->pos = firstStmtPos;
|
||||
ctx->EmitVariableDebugInfo(maskSymbol);
|
||||
|
||||
#if 0
|
||||
llvm::BasicBlock *entryBBlock = ctx->GetCurrentBasicBlock();
|
||||
#endif
|
||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
||||
assert(type != NULL);
|
||||
if (type->isTask == true) {
|
||||
// For tasks, we there should always be three parmeters: the
|
||||
// pointer to the structure that holds all of the arguments, the
|
||||
// thread index, and the thread count variables.
|
||||
llvm::Function::arg_iterator argIter = function->arg_begin();
|
||||
llvm::Value *structParamPtr = argIter++;
|
||||
llvm::Value *threadIndex = argIter++;
|
||||
llvm::Value *threadCount = argIter++;
|
||||
llvm::Value *taskIndex = argIter++;
|
||||
llvm::Value *taskCount = argIter++;
|
||||
|
||||
// Copy the function parameter values from the structure into local
|
||||
// storage
|
||||
for (unsigned int i = 0; i < args.size(); ++i)
|
||||
lCopyInTaskParameter(i, structParamPtr, args, ctx);
|
||||
|
||||
// Copy in the mask as well.
|
||||
int nArgs = (int)args.size();
|
||||
// The mask is the last parameter in the argument structure
|
||||
llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL,
|
||||
"task_struct_mask");
|
||||
llvm::Value *ptrval = ctx->LoadInst(ptr, "mask");
|
||||
ctx->SetFunctionMask(ptrval);
|
||||
|
||||
// Copy threadIndex and threadCount into stack-allocated storage so
|
||||
// that their symbols point to something reasonable.
|
||||
threadIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadIndex");
|
||||
ctx->StoreInst(threadIndex, threadIndexSym->storagePtr);
|
||||
|
||||
threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount");
|
||||
ctx->StoreInst(threadCount, threadCountSym->storagePtr);
|
||||
|
||||
// Copy taskIndex and taskCount into stack-allocated storage so
|
||||
// that their symbols point to something reasonable.
|
||||
taskIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex");
|
||||
ctx->StoreInst(taskIndex, taskIndexSym->storagePtr);
|
||||
|
||||
taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount");
|
||||
ctx->StoreInst(taskCount, taskCountSym->storagePtr);
|
||||
}
|
||||
else {
|
||||
// Regular, non-task function
|
||||
llvm::Function::arg_iterator argIter = function->arg_begin();
|
||||
for (unsigned int i = 0; i < args.size(); ++i, ++argIter) {
|
||||
Symbol *sym = args[i];
|
||||
argIter->setName(sym->name.c_str());
|
||||
|
||||
// Allocate stack storage for the parameter and emit code
|
||||
// to store the its value there.
|
||||
sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str());
|
||||
ctx->StoreInst(argIter, sym->storagePtr);
|
||||
ctx->EmitFunctionParameterDebugInfo(sym);
|
||||
}
|
||||
|
||||
// If the number of actual function arguments is equal to the
|
||||
// number of declared arguments in decl->functionParams, then we
|
||||
// don't have a mask parameter, so set it to be all on. This
|
||||
// happens for exmaple with 'export'ed functions that the app
|
||||
// calls.
|
||||
if (argIter == function->arg_end())
|
||||
ctx->SetFunctionMask(LLVMMaskAllOn);
|
||||
else {
|
||||
// Otherwise use the mask to set the entry mask value
|
||||
argIter->setName("__mask");
|
||||
assert(argIter->getType() == LLVMTypes::MaskType);
|
||||
ctx->SetFunctionMask(argIter);
|
||||
assert(++argIter == function->arg_end());
|
||||
}
|
||||
}
|
||||
|
||||
// Finally, we can generate code for the function
|
||||
if (code != NULL) {
|
||||
int costEstimate = code->EstimateCost();
|
||||
bool checkMask = (type->isTask == true) ||
|
||||
((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
|
||||
costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
|
||||
Debug(code->pos, "Estimated cost for function \"%s\" = %d\n",
|
||||
sym->name.c_str(), costEstimate);
|
||||
// If the body of the function is non-trivial, then we wrap the
|
||||
// entire thing around a varying "cif (true)" test in order to reap
|
||||
// the side-effect benefit of checking to see if the execution mask
|
||||
// is all on and thence having a specialized code path for that
|
||||
// case. If this is a simple function, then this isn't worth the
|
||||
// code bloat / overhead.
|
||||
if (checkMask) {
|
||||
bool allTrue[ISPC_MAX_NVEC];
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
allTrue[i] = true;
|
||||
Expr *trueExpr = new ConstExpr(AtomicType::VaryingBool, allTrue,
|
||||
code->pos);
|
||||
code = new IfStmt(trueExpr, code, NULL, true, code->pos);
|
||||
}
|
||||
|
||||
ctx->SetDebugPos(code->pos);
|
||||
ctx->AddInstrumentationPoint("function entry");
|
||||
code->EmitCode(ctx);
|
||||
}
|
||||
|
||||
if (ctx->GetCurrentBasicBlock()) {
|
||||
// FIXME: We'd like to issue a warning if we've reached the end of
|
||||
// the function without a return statement (for non-void
|
||||
// functions). But the test below isn't right, since we can have
|
||||
// (with 'x' a varying test) "if (x) return a; else return b;", in
|
||||
// which case we have a valid basic block but its unreachable so ok
|
||||
// to not have return statement.
|
||||
#if 0
|
||||
// If the bblock has no predecessors, then it doesn't matter if it
|
||||
// doesn't have a return; it'll never be reached. If it does,
|
||||
// issue a warning. Also need to warn if it's the entry block for
|
||||
// the function (in which case it will not have predeccesors but is
|
||||
// still reachable.)
|
||||
if (type->GetReturnType() != AtomicType::Void &&
|
||||
(pred_begin(ec.bblock) != pred_end(ec.bblock) || (ec.bblock == entryBBlock)))
|
||||
Warning(sym->pos, "Missing return statement in function returning \"%s\".",
|
||||
type->rType->GetString().c_str());
|
||||
#endif
|
||||
|
||||
// FIXME: would like to set the context's current position to
|
||||
// e.g. the end of the function code
|
||||
|
||||
// if bblock is non-NULL, it hasn't been terminated by e.g. a
|
||||
// return instruction. Need to add a return instruction.
|
||||
ctx->ReturnInst();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Function::GenerateIR() {
|
||||
if (sym == NULL)
|
||||
// May be NULL due to error earlier in compilation
|
||||
return;
|
||||
|
||||
llvm::Function *function = sym->function;
|
||||
assert(function != NULL);
|
||||
|
||||
// But if that function has a definition, we don't want to redefine it.
|
||||
if (function->empty() == false) {
|
||||
Error(sym->pos, "Ignoring redefinition of function \"%s\".",
|
||||
sym->name.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
// Figure out a reasonable source file position for the start of the
|
||||
// function body. If possible, get the position of the first actual
|
||||
// non-StmtList statment...
|
||||
SourcePos firstStmtPos = sym->pos;
|
||||
if (code) {
|
||||
StmtList *sl = dynamic_cast<StmtList *>(code);
|
||||
if (sl && sl->GetStatements().size() > 0 &&
|
||||
sl->GetStatements()[0] != NULL)
|
||||
firstStmtPos = sl->GetStatements()[0]->pos;
|
||||
else
|
||||
firstStmtPos = code->pos;
|
||||
}
|
||||
|
||||
// And we can now go ahead and emit the code
|
||||
{
|
||||
FunctionEmitContext ec(this, sym, function, firstStmtPos);
|
||||
emitCode(&ec, function, firstStmtPos);
|
||||
}
|
||||
|
||||
if (m->errorCount == 0) {
|
||||
if (llvm::verifyFunction(*function, llvm::ReturnStatusAction) == true) {
|
||||
if (g->debugPrint)
|
||||
function->dump();
|
||||
FATAL("Function verificication failed");
|
||||
}
|
||||
|
||||
// If the function is 'export'-qualified, emit a second version of
|
||||
// it without a mask parameter and without name mangling so that
|
||||
// the application can call it
|
||||
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
|
||||
assert(type != NULL);
|
||||
if (type->isExported) {
|
||||
if (!type->isTask) {
|
||||
LLVM_TYPE_CONST llvm::FunctionType *ftype =
|
||||
type->LLVMFunctionType(g->ctx);
|
||||
llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
|
||||
std::string functionName = sym->name;
|
||||
if (g->mangleFunctionsWithTarget)
|
||||
functionName += std::string("_") + g->target.GetISAString();
|
||||
llvm::Function *appFunction =
|
||||
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
|
||||
appFunction->setDoesNotThrow(true);
|
||||
|
||||
if (appFunction->getName() != functionName) {
|
||||
// this was a redefinition for which we already emitted an
|
||||
// error, so don't worry about this one...
|
||||
appFunction->eraseFromParent();
|
||||
}
|
||||
else {
|
||||
// And emit the code again
|
||||
FunctionEmitContext ec(this, sym, appFunction, firstStmtPos);
|
||||
emitCode(&ec, appFunction, firstStmtPos);
|
||||
if (m->errorCount == 0) {
|
||||
sym->exportedFunction = appFunction;
|
||||
if (llvm::verifyFunction(*appFunction,
|
||||
llvm::ReturnStatusAction) == true) {
|
||||
if (g->debugPrint)
|
||||
appFunction->dump();
|
||||
FATAL("Function verificication failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
66
func.h
66
func.h
@@ -1,66 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file func.h
|
||||
@brief Representation of a function in a source file.
|
||||
*/
|
||||
|
||||
#ifndef ISPC_FUNC_H
|
||||
#define ISPC_FUNC_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include <vector>
|
||||
|
||||
class Function {
|
||||
public:
|
||||
Function(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code);
|
||||
|
||||
const Type *GetReturnType() const;
|
||||
const FunctionType *GetType() const;
|
||||
|
||||
/** Generate LLVM IR for the function into the current module. */
|
||||
void GenerateIR();
|
||||
|
||||
private:
|
||||
void emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
SourcePos firstStmtPos);
|
||||
|
||||
Symbol *sym;
|
||||
std::vector<Symbol *> args;
|
||||
Stmt *code;
|
||||
Symbol *maskSymbol;
|
||||
Symbol *threadIndexSym, *threadCountSym;
|
||||
Symbol *taskIndexSym, *taskCountSym;
|
||||
};
|
||||
|
||||
#endif // ISPC_FUNC_H
|
||||
119
ispc.cpp
119
ispc.cpp
@@ -38,7 +38,6 @@
|
||||
#include "ispc.h"
|
||||
#include "module.h"
|
||||
#include "util.h"
|
||||
#include "llvmutil.h"
|
||||
#include <stdio.h>
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#include <windows.h>
|
||||
@@ -53,7 +52,7 @@
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/Target/TargetOptions.h>
|
||||
#include <llvm/Target/TargetData.h>
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
#include <llvm/Support/TargetRegistry.h>
|
||||
#include <llvm/Support/TargetSelect.h>
|
||||
#else
|
||||
@@ -75,7 +74,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
if (cpu == NULL) {
|
||||
std::string hostCPU = llvm::sys::getHostCPUName();
|
||||
if (hostCPU.size() > 0)
|
||||
cpu = strdup(hostCPU.c_str());
|
||||
cpu = hostCPU.c_str();
|
||||
else {
|
||||
fprintf(stderr, "Warning: unable to determine host CPU!\n");
|
||||
cpu = "generic";
|
||||
@@ -86,7 +85,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
if (isa == NULL) {
|
||||
if (!strcasecmp(cpu, "atom"))
|
||||
isa = "sse2";
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||
else if (!strcasecmp(cpu, "sandybridge") ||
|
||||
!strcasecmp(cpu, "corei7-avx"))
|
||||
isa = "avx";
|
||||
@@ -130,25 +129,19 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->vectorWidth = 4;
|
||||
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
|
||||
}
|
||||
else if (!strcasecmp(isa, "sse2-x2")) {
|
||||
t->isa = Target::SSE2;
|
||||
t->nativeVectorWidth = 4;
|
||||
t->vectorWidth = 8;
|
||||
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
|
||||
}
|
||||
else if (!strcasecmp(isa, "sse4")) {
|
||||
t->isa = Target::SSE4;
|
||||
t->nativeVectorWidth = 4;
|
||||
t->vectorWidth = 4;
|
||||
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
||||
}
|
||||
else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) {
|
||||
else if (!strcasecmp(isa, "sse4x2")) {
|
||||
t->isa = Target::SSE4;
|
||||
t->nativeVectorWidth = 4;
|
||||
t->vectorWidth = 8;
|
||||
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
||||
}
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
else if (!strcasecmp(isa, "avx")) {
|
||||
t->isa = Target::AVX;
|
||||
t->nativeVectorWidth = 8;
|
||||
@@ -171,7 +164,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
if (!error) {
|
||||
llvm::TargetMachine *targetMachine = t->GetTargetMachine();
|
||||
const llvm::TargetData *targetData = targetMachine->getTargetData();
|
||||
t->is32Bit = (targetData->getPointerSize() == 4);
|
||||
t->is32bit = (targetData->getPointerSize() == 4);
|
||||
}
|
||||
|
||||
return !error;
|
||||
@@ -181,7 +174,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
const char *
|
||||
Target::SupportedTargetCPUs() {
|
||||
return "atom, barcelona, core2, corei7, "
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||
"corei7-avx, "
|
||||
#endif
|
||||
"istanbul, nocona, penryn, "
|
||||
@@ -200,8 +193,8 @@ Target::SupportedTargetArchs() {
|
||||
|
||||
const char *
|
||||
Target::SupportedTargetISAs() {
|
||||
return "sse2, sse2-x2, sse4, sse4-x2"
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
return "sse2, sse4, sse4x2"
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||
", avx, avx-x2"
|
||||
#endif
|
||||
;
|
||||
@@ -212,11 +205,7 @@ std::string
|
||||
Target::GetTripleString() const {
|
||||
llvm::Triple triple;
|
||||
// Start with the host triple as the default
|
||||
#if defined(LLVM_3_1) || defined(LLVM_3_1svn)
|
||||
triple.setTriple(llvm::sys::getDefaultTargetTriple());
|
||||
#else
|
||||
triple.setTriple(llvm::sys::getHostTriple());
|
||||
#endif
|
||||
|
||||
// And override the arch in the host triple based on what the user
|
||||
// specified. Here we need to deal with the fact that LLVM uses one
|
||||
@@ -241,7 +230,7 @@ Target::GetTargetMachine() const {
|
||||
|
||||
llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ :
|
||||
llvm::Reloc::Default;
|
||||
#if defined(LLVM_3_0svn) || defined(LLVM_3_1svn) || defined(LLVM_3_0)
|
||||
#if defined(LLVM_3_0svn) || defined(LLVM_3_0)
|
||||
std::string featuresString = attributes;
|
||||
llvm::TargetMachine *targetMachine =
|
||||
target->createTargetMachine(triple, cpu, featuresString, relocModel);
|
||||
@@ -263,53 +252,6 @@ Target::GetTargetMachine() const {
|
||||
}
|
||||
|
||||
|
||||
const char *
|
||||
Target::GetISAString() const {
|
||||
switch (isa) {
|
||||
case Target::SSE2:
|
||||
return "sse2";
|
||||
case Target::SSE4:
|
||||
return "sse4";
|
||||
case Target::AVX:
|
||||
return "avx";
|
||||
break;
|
||||
default:
|
||||
FATAL("Unhandled target in GetISAString()");
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
|
||||
llvm::Value *
|
||||
Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type) {
|
||||
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
|
||||
assert(td != NULL);
|
||||
uint64_t byteSize = td->getTypeSizeInBits(type) / 8;
|
||||
if (is32Bit || g->opt.force32BitAddressing)
|
||||
return LLVMInt32(byteSize);
|
||||
else
|
||||
return LLVMInt64(byteSize);
|
||||
}
|
||||
|
||||
|
||||
llvm::Value *
|
||||
Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element) {
|
||||
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
|
||||
assert(td != NULL);
|
||||
LLVM_TYPE_CONST llvm::StructType *structType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
|
||||
assert(structType != NULL);
|
||||
const llvm::StructLayout *sl = td->getStructLayout(structType);
|
||||
assert(sl != NULL);
|
||||
|
||||
uint64_t offset = sl->getElementOffset(element);
|
||||
if (is32Bit || g->opt.force32BitAddressing)
|
||||
return LLVMInt32(offset);
|
||||
else
|
||||
return LLVMInt64(offset);
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Opt
|
||||
|
||||
@@ -317,10 +259,7 @@ Opt::Opt() {
|
||||
level = 1;
|
||||
fastMath = false;
|
||||
fastMaskedVload = false;
|
||||
force32BitAddressing = true;
|
||||
unrollLoops = true;
|
||||
disableAsserts = false;
|
||||
disableHandlePseudoMemoryOps = false;
|
||||
disableBlendedMaskedStores = false;
|
||||
disableCoherentControlFlow = false;
|
||||
disableUniformControlFlow = false;
|
||||
@@ -341,37 +280,35 @@ Globals::Globals() {
|
||||
runCPP = true;
|
||||
debugPrint = false;
|
||||
disableWarnings = false;
|
||||
warningsAsErrors = false;
|
||||
disableLineWrap = false;
|
||||
emitPerfWarnings = true;
|
||||
emitInstrumentation = false;
|
||||
generateDebuggingSymbols = false;
|
||||
mangleFunctionsWithTarget = false;
|
||||
|
||||
ctx = new llvm::LLVMContext;
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
_getcwd(currentDirectory, sizeof(currentDirectory));
|
||||
#else
|
||||
if (getcwd(currentDirectory, sizeof(currentDirectory)) == NULL)
|
||||
FATAL("Current directory path too long!");
|
||||
getcwd(currentDirectory, sizeof(currentDirectory));
|
||||
#endif
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// ASTNode
|
||||
|
||||
ASTNode::~ASTNode() {
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// SourcePos
|
||||
|
||||
SourcePos::SourcePos(const char *n, int fl, int fc, int ll, int lc) {
|
||||
SourcePos::SourcePos(const char *n, int l, int c) {
|
||||
name = n ? n : m->module->getModuleIdentifier().c_str();
|
||||
first_line = fl;
|
||||
first_column = fc;
|
||||
last_line = ll != 0 ? ll : fl;
|
||||
last_column = lc != 0 ? lc : fc;
|
||||
first_line = last_line = l;
|
||||
first_column = last_column = c;
|
||||
}
|
||||
|
||||
|
||||
llvm::DIFile
|
||||
SourcePos::GetDIFile() const {
|
||||
llvm::DIFile SourcePos::GetDIFile() const {
|
||||
std::string directory, filename;
|
||||
GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
|
||||
return m->diBuilder->createFile(filename, directory);
|
||||
@@ -394,17 +331,3 @@ SourcePos::operator==(const SourcePos &p2) const {
|
||||
last_column == p2.last_column);
|
||||
}
|
||||
|
||||
|
||||
SourcePos
|
||||
Union(const SourcePos &p1, const SourcePos &p2) {
|
||||
if (strcmp(p1.name, p2.name) != 0)
|
||||
return p1;
|
||||
|
||||
SourcePos ret;
|
||||
ret.name = p1.name;
|
||||
ret.first_line = std::min(p1.first_line, p2.first_line);
|
||||
ret.first_column = std::min(p1.first_column, p2.first_column);
|
||||
ret.last_line = std::max(p1.last_line, p2.last_line);
|
||||
ret.last_column = std::max(p1.last_column, p2.last_column);
|
||||
return ret;
|
||||
}
|
||||
|
||||
103
ispc.h
103
ispc.h
@@ -38,10 +38,6 @@
|
||||
#ifndef ISPC_H
|
||||
#define ISPC_H
|
||||
|
||||
#if !defined(LLVM_2_9) && !defined(LLVM_3_0) && !defined(LLVM_3_0svn) && !defined(LLVM_3_1svn)
|
||||
#error "Only LLVM 2.9, 3.0, and the 3.1 development branch are supported"
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
@@ -80,7 +76,7 @@ namespace llvm {
|
||||
}
|
||||
|
||||
// llvm::Type *s are no longer const in llvm 3.0
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
#define LLVM_TYPE_CONST
|
||||
#else
|
||||
#define LLVM_TYPE_CONST const
|
||||
@@ -88,17 +84,19 @@ namespace llvm {
|
||||
|
||||
class ArrayType;
|
||||
class AtomicType;
|
||||
class DeclSpecs;
|
||||
class Declaration;
|
||||
class Declarator;
|
||||
class FunctionEmitContext;
|
||||
class Expr;
|
||||
class ExprList;
|
||||
class Function;
|
||||
class FunctionType;
|
||||
class GatherBuffer;
|
||||
class Module;
|
||||
class Stmt;
|
||||
class Symbol;
|
||||
class SymbolTable;
|
||||
class Type;
|
||||
struct VariableDeclaration;
|
||||
|
||||
/** @brief Representation of a range of positions in a source file.
|
||||
|
||||
@@ -108,8 +106,7 @@ struct VariableDeclaration;
|
||||
lexing code). Both lines and columns are counted starting from one.
|
||||
*/
|
||||
struct SourcePos {
|
||||
SourcePos(const char *n = NULL, int fl = 0, int fc = 0,
|
||||
int ll = 0, int lc = 0);
|
||||
SourcePos(const char *n = NULL, int l = 0, int c = 0);
|
||||
|
||||
const char *name;
|
||||
int first_line;
|
||||
@@ -126,10 +123,37 @@ struct SourcePos {
|
||||
bool operator==(const SourcePos &p2) const;
|
||||
};
|
||||
|
||||
/** Returns a SourcePos that encompasses the extent of both of the given
|
||||
extents. */
|
||||
SourcePos Union(const SourcePos &p1, const SourcePos &p2);
|
||||
|
||||
/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
|
||||
|
||||
This class defines a basic interface that all abstract syntax tree
|
||||
(AST) nodes must implement. The base classes for both expressions
|
||||
(Expr) and statements (Stmt) inherit from this class.
|
||||
*/
|
||||
class ASTNode {
|
||||
public:
|
||||
ASTNode(SourcePos p) : pos(p) { }
|
||||
virtual ~ASTNode();
|
||||
|
||||
/** The Optimize() method should perform any appropriate early-stage
|
||||
optimizations on the node (e.g. constant folding). The caller
|
||||
should use the returned ASTNode * in place of the original node.
|
||||
This method may return NULL if an error is encountered during
|
||||
optimization. */
|
||||
virtual ASTNode *Optimize() = 0;
|
||||
|
||||
/** Type checking should be performed by the node when this method is
|
||||
called. In the event of an error, a NULL value may be returned.
|
||||
As with ASTNode::Optimize(), the caller should store the returned
|
||||
pointer in place of the original ASTNode *. */
|
||||
virtual ASTNode *TypeCheck() = 0;
|
||||
|
||||
virtual int EstimateCost() const = 0;
|
||||
|
||||
/** All AST nodes must track the file position where they are
|
||||
defined. */
|
||||
const SourcePos pos;
|
||||
};
|
||||
|
||||
/** @brief Structure that defines a compilation target
|
||||
|
||||
@@ -161,28 +185,13 @@ struct Target {
|
||||
/** Returns the LLVM TargetMachine object corresponding to this
|
||||
target. */
|
||||
llvm::TargetMachine *GetTargetMachine() const;
|
||||
|
||||
/** Returns a string like "avx" encoding the target. */
|
||||
const char *GetISAString() const;
|
||||
|
||||
/** Returns the size of the given type */
|
||||
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *type);
|
||||
/** Given a structure type and an element number in the structure,
|
||||
returns a value corresponding to the number of bytes from the start
|
||||
of the structure where the element is located. */
|
||||
llvm::Value *StructOffset(LLVM_TYPE_CONST llvm::Type *type,
|
||||
int element);
|
||||
|
||||
/** llvm Target object representing this target. */
|
||||
const llvm::Target *target;
|
||||
|
||||
/** Enumerator giving the instruction sets that the compiler can
|
||||
target. These should be ordered from "worse" to "better" in that
|
||||
if a processor supports multiple target ISAs, then the most
|
||||
flexible/performant of them will apear last in the enumerant. Note
|
||||
also that __best_available_isa() needs to be updated if ISAs are
|
||||
added or the enumerant values are reordered. */
|
||||
enum ISA { SSE2, SSE4, AVX, NUM_ISAS };
|
||||
target. */
|
||||
enum ISA { SSE2, SSE4, AVX };
|
||||
|
||||
/** Instruction set being compiled to. */
|
||||
ISA isa;
|
||||
@@ -191,7 +200,7 @@ struct Target {
|
||||
std::string arch;
|
||||
|
||||
/** Is the target architecture 32 or 64 bit */
|
||||
bool is32Bit;
|
||||
bool is32bit;
|
||||
|
||||
/** Target CPU. (e.g. "corei7", "corei7-avx", ..) */
|
||||
std::string cpu;
|
||||
@@ -241,22 +250,6 @@ struct Opt {
|
||||
it will make sense. */
|
||||
bool unrollLoops;
|
||||
|
||||
/** Indicates if addressing math will be done with 32-bit math, even on
|
||||
64-bit systems. (This is generally noticably more efficient,
|
||||
though at the cost of addressing >2GB).
|
||||
*/
|
||||
bool force32BitAddressing;
|
||||
|
||||
/** Indicates whether assert() statements should be ignored (for
|
||||
performance in the generated code). */
|
||||
bool disableAsserts;
|
||||
|
||||
/** If enabled, the various __pseudo* memory ops (gather/scatter,
|
||||
masked load/store) are left in their __pseudo* form, for better
|
||||
understanding of the structure of generated code when reading
|
||||
it. */
|
||||
bool disableHandlePseudoMemoryOps;
|
||||
|
||||
/** On targets that don't have a masked store instruction but do have a
|
||||
blending instruction, by default, we simulate masked stores by
|
||||
loading the old value, blending, and storing the result. This can
|
||||
@@ -348,13 +341,6 @@ struct Globals {
|
||||
/** Indicates whether all warning messages should be surpressed. */
|
||||
bool disableWarnings;
|
||||
|
||||
/** Indicates whether warnings should be issued as errors. */
|
||||
bool warningsAsErrors;
|
||||
|
||||
/** Indicates whether line wrapping of error messages to the terminal
|
||||
width should be disabled. */
|
||||
bool disableLineWrap;
|
||||
|
||||
/** Indicates whether additional warnings should be issued about
|
||||
possible performance pitfalls. */
|
||||
bool emitPerfWarnings;
|
||||
@@ -368,10 +354,6 @@ struct Globals {
|
||||
/** Indicates whether ispc should generate debugging symbols for the
|
||||
program in its output. */
|
||||
bool generateDebuggingSymbols;
|
||||
|
||||
/** If true, function names are mangled by appending the target ISA and
|
||||
vector width to them. */
|
||||
bool mangleFunctionsWithTarget;
|
||||
|
||||
/** Global LLVMContext object */
|
||||
llvm::LLVMContext *ctx;
|
||||
@@ -391,8 +373,6 @@ enum {
|
||||
COST_COMPLEX_ARITH_OP = 4,
|
||||
COST_DEREF = 4,
|
||||
COST_FUNCALL = 4,
|
||||
COST_FUNPTR_UNIFORM = 12,
|
||||
COST_FUNPTR_VARYING = 24,
|
||||
COST_GATHER = 8,
|
||||
COST_LOAD = 2,
|
||||
COST_REGULAR_BREAK_CONTINUE = 2,
|
||||
@@ -400,14 +380,11 @@ enum {
|
||||
COST_SELECT = 4,
|
||||
COST_SIMPLE_ARITH_LOGIC_OP = 1,
|
||||
COST_SYNC = 32,
|
||||
COST_TASK_LAUNCH = 32,
|
||||
COST_TASK_LAUNCH = 16,
|
||||
COST_TYPECAST_COMPLEX = 4,
|
||||
COST_TYPECAST_SIMPLE = 1,
|
||||
COST_UNIFORM_IF = 2,
|
||||
COST_VARYING_IF = 3,
|
||||
COST_UNIFORM_LOOP = 4,
|
||||
COST_VARYING_LOOP = 6,
|
||||
COST_ASSERT = 8,
|
||||
|
||||
CHECK_MASK_AT_FUNCTION_START_COST = 16,
|
||||
PREDICATE_SAFE_IF_STATEMENT_COST = 6,
|
||||
|
||||
78
ispc.vcxproj
78
ispc.vcxproj
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -11,35 +11,25 @@
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="ast.cpp" />
|
||||
<ClCompile Include="builtins.cpp" />
|
||||
<ClCompile Include="ctx.cpp" />
|
||||
<ClCompile Include="decl.cpp" />
|
||||
<ClCompile Include="expr.cpp" />
|
||||
<ClCompile Include="func.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx-x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-c-32.cpp" />
|
||||
<ClCompile Include="gen-bitcode-c-64.cpp" />
|
||||
<ClCompile Include="gen-bitcode-dispatch.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse2-x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse4.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse4-x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse4x2.cpp" />
|
||||
<ClCompile Include="gen-stdlib.cpp" />
|
||||
<ClCompile Include="ispc.cpp" />
|
||||
<ClCompile Include="lex.cc">
|
||||
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4003;4018</DisableSpecificWarnings>
|
||||
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">4146;4800;4996;4355;4624;4005;4003;4018</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
<ClCompile Include="lex.cc" />
|
||||
<ClCompile Include="llvmutil.cpp" />
|
||||
<ClCompile Include="module.cpp" />
|
||||
<ClCompile Include="main.cpp" />
|
||||
<ClCompile Include="opt.cpp" />
|
||||
<ClCompile Include="parse.cc">
|
||||
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
|
||||
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
<ClCompile Include="parse.cc" />
|
||||
<CustomBuild Include="builtins-c.c">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp;
|
||||
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp</Command>
|
||||
@@ -56,12 +46,10 @@
|
||||
<ClCompile Include="util.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="ast.h" />
|
||||
<ClInclude Include="builtins.h" />
|
||||
<ClInclude Include="ctx.h" />
|
||||
<ClInclude Include="decl.h" />
|
||||
<ClInclude Include="expr.h" />
|
||||
<ClInclude Include="func.h" />
|
||||
<ClInclude Include="ispc.h" />
|
||||
<ClInclude Include="llvmutil.h" />
|
||||
<ClInclude Include="module.h" />
|
||||
@@ -88,38 +76,25 @@
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins-dispatch.ll">
|
||||
<CustomBuild Include="builtins-sse4x2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-dispatch.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-dispatch.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-dispatch.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-dispatch.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins-sse4-x2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
@@ -127,36 +102,23 @@
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins-sse2-x2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2-x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2-x2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins-avx.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-avx-common.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-avx-common.ll</AdditionalInputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
|
||||
</CustomBuild>
|
||||
@@ -268,4 +230,4 @@
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
@@ -74,7 +74,7 @@ extern "C" {
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/ExecutionEngine/ExecutionEngine.h>
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
#include <llvm/Support/TargetRegistry.h>
|
||||
#include <llvm/Support/TargetSelect.h>
|
||||
#else
|
||||
@@ -120,7 +120,7 @@ void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
|
||||
*handle = (void *)0xdeadbeef;
|
||||
// leak time!
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return _aligned_malloc((size_t)size, alignment);
|
||||
return _aligned_malloc(size, alignment);
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
return memalign(alignment, size);
|
||||
@@ -182,7 +182,7 @@ static bool lRunTest(const char *fn) {
|
||||
}
|
||||
|
||||
std::string eeError;
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::EngineBuilder engineBuilder(module);
|
||||
engineBuilder.setErrorStr(&eeError);
|
||||
engineBuilder.setEngineKind(llvm::EngineKind::JIT);
|
||||
@@ -361,7 +361,7 @@ static bool lRunTest(const char *fn) {
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
llvm::InitializeNativeTarget();
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
LLVMLinkInJIT();
|
||||
#endif
|
||||
|
||||
|
||||
@@ -54,7 +54,6 @@
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>LLVM_3_0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -73,7 +72,6 @@
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>LLVM_3_0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
|
||||
18
lex.ll
18
lex.ll
@@ -34,13 +34,13 @@
|
||||
%{
|
||||
|
||||
#include "ispc.h"
|
||||
#include "decl.h"
|
||||
#include "sym.h"
|
||||
#include "util.h"
|
||||
#include "module.h"
|
||||
#include "type.h"
|
||||
#include "parse.hh"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
static uint64_t lParseBinary(const char *ptr, SourcePos pos);
|
||||
static void lCComment(SourcePos *);
|
||||
@@ -78,7 +78,6 @@ ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+
|
||||
"/*" { lCComment(yylloc); }
|
||||
"//" { lCppComment(yylloc); }
|
||||
|
||||
__assert { return TOKEN_ASSERT; }
|
||||
bool { return TOKEN_BOOL; }
|
||||
break { return TOKEN_BREAK; }
|
||||
case { return TOKEN_CASE; }
|
||||
@@ -86,6 +85,7 @@ cbreak { return TOKEN_CBREAK; }
|
||||
ccontinue { return TOKEN_CCONTINUE; }
|
||||
cdo { return TOKEN_CDO; }
|
||||
cfor { return TOKEN_CFOR; }
|
||||
char { return TOKEN_CHAR; }
|
||||
cif { return TOKEN_CIF; }
|
||||
cwhile { return TOKEN_CWHILE; }
|
||||
const { return TOKEN_CONST; }
|
||||
@@ -101,8 +101,6 @@ extern { return TOKEN_EXTERN; }
|
||||
false { return TOKEN_FALSE; }
|
||||
float { return TOKEN_FLOAT; }
|
||||
for { return TOKEN_FOR; }
|
||||
foreach { return TOKEN_FOREACH; }
|
||||
foreach_tiled { return TOKEN_FOREACH_TILED; }
|
||||
goto { return TOKEN_GOTO; }
|
||||
if { return TOKEN_IF; }
|
||||
inline { return TOKEN_INLINE; }
|
||||
@@ -112,15 +110,10 @@ int16 { return TOKEN_INT16; }
|
||||
int32 { return TOKEN_INT; }
|
||||
int64 { return TOKEN_INT64; }
|
||||
launch { return TOKEN_LAUNCH; }
|
||||
NULL { return TOKEN_NULL; }
|
||||
print { return TOKEN_PRINT; }
|
||||
reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
|
||||
"please use C++-style '&' syntax for references "
|
||||
"instead."); }
|
||||
reference { return TOKEN_REFERENCE; }
|
||||
return { return TOKEN_RETURN; }
|
||||
soa { return TOKEN_SOA; }
|
||||
signed { return TOKEN_SIGNED; }
|
||||
sizeof { return TOKEN_SIZEOF; }
|
||||
static { return TOKEN_STATIC; }
|
||||
struct { return TOKEN_STRUCT; }
|
||||
switch { return TOKEN_SWITCH; }
|
||||
@@ -133,8 +126,6 @@ unsigned { return TOKEN_UNSIGNED; }
|
||||
varying { return TOKEN_VARYING; }
|
||||
void { return TOKEN_VOID; }
|
||||
while { return TOKEN_WHILE; }
|
||||
\"C\" { return TOKEN_STRING_C_LITERAL; }
|
||||
\.\.\. { return TOKEN_DOTDOTDOT; }
|
||||
|
||||
L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; }
|
||||
|
||||
@@ -230,7 +221,6 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
|
||||
"&=" { return TOKEN_AND_ASSIGN; }
|
||||
"^=" { return TOKEN_XOR_ASSIGN; }
|
||||
"|=" { return TOKEN_OR_ASSIGN; }
|
||||
"->" { return TOKEN_PTR_OP; }
|
||||
";" { return ';'; }
|
||||
("{"|"<%") { return '{'; }
|
||||
("}"|"%>") { return '}'; }
|
||||
@@ -274,6 +264,8 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
|
||||
|
||||
%%
|
||||
|
||||
/*sizeof { return TOKEN_SIZEOF; }*/
|
||||
/*"->" { return TOKEN_PTR_OP; }*/
|
||||
/*short { return TOKEN_SHORT; }*/
|
||||
/*long { return TOKEN_LONG; }*/
|
||||
/*signed { return TOKEN_SIGNED; }*/
|
||||
|
||||
17
llvmutil.cpp
17
llvmutil.cpp
@@ -40,7 +40,6 @@
|
||||
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
|
||||
LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::PointerIntType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::BoolType = NULL;
|
||||
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8Type = NULL;
|
||||
@@ -75,7 +74,7 @@ LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
|
||||
|
||||
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::VoidPointerVectorType = NULL;
|
||||
LLVM_TYPE_CONST llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
|
||||
|
||||
llvm::Constant *LLVMTrue = NULL;
|
||||
llvm::Constant *LLVMFalse = NULL;
|
||||
@@ -87,8 +86,6 @@ void
|
||||
InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
|
||||
LLVMTypes::VoidType = llvm::Type::getVoidTy(*ctx);
|
||||
LLVMTypes::VoidPointerType = llvm::PointerType::get(llvm::Type::getInt8Ty(*ctx), 0);
|
||||
LLVMTypes::PointerIntType = target.is32Bit ? llvm::Type::getInt32Ty(*ctx) :
|
||||
llvm::Type::getInt64Ty(*ctx);
|
||||
|
||||
LLVMTypes::BoolType = llvm::Type::getInt1Ty(*ctx);
|
||||
LLVMTypes::Int8Type = llvm::Type::getInt8Ty(*ctx);
|
||||
@@ -133,8 +130,8 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
|
||||
LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
|
||||
LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);
|
||||
|
||||
LLVMTypes::VoidPointerVectorType = g->target.is32Bit ? LLVMTypes::Int32VectorType :
|
||||
LLVMTypes::Int64VectorType;
|
||||
LLVMTypes::VoidPointerVectorType =
|
||||
llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth);
|
||||
|
||||
LLVMTrue = llvm::ConstantInt::getTrue(*ctx);
|
||||
LLVMFalse = llvm::ConstantInt::getFalse(*ctx);
|
||||
@@ -454,3 +451,11 @@ LLVMBoolVector(const bool *bvec) {
|
||||
}
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
LLVM_TYPE_CONST llvm::ArrayType *
|
||||
LLVMPointerVectorType(LLVM_TYPE_CONST llvm::Type *t) {
|
||||
// NOTE: ArrayType, not VectorType
|
||||
return llvm::ArrayType::get(llvm::PointerType::get(t, 0),
|
||||
g->target.vectorWidth);
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user