Compare commits
49 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
790dba2558 | ||
|
|
ce7355f9ed | ||
|
|
6b4459d402 | ||
|
|
4a2cbf2c4e | ||
|
|
53dd65fa2e | ||
|
|
f5afa52fd9 | ||
|
|
f9c67ff806 | ||
|
|
ec5e627e56 | ||
|
|
ff2a43ac19 | ||
|
|
9feea32471 | ||
|
|
bedaec2295 | ||
|
|
a68d137df6 | ||
|
|
59caa3d4e1 | ||
|
|
06975bc7ab | ||
|
|
880cbb18cc | ||
|
|
686d9975b6 | ||
|
|
9b7f55a28e | ||
|
|
e4d224a0f1 | ||
|
|
0933a77c1b | ||
|
|
5f78edf07a | ||
|
|
a6fc657b40 | ||
|
|
fa5050d5c7 | ||
|
|
d5a48d9a1e | ||
|
|
2df9da2524 | ||
|
|
0b02f94988 | ||
|
|
65c50b60fc | ||
|
|
9de34eb22c | ||
|
|
f8f25a11b6 | ||
|
|
cb7976bbf6 | ||
|
|
5ee4d7fce8 | ||
|
|
8f3e46f67e | ||
|
|
9ed07ff2b5 | ||
|
|
32a0a30cf5 | ||
|
|
6d39d5fc3e | ||
|
|
c999c8a237 | ||
|
|
aad269fdf4 | ||
|
|
d45c536c47 | ||
|
|
f1b8e5b1bf | ||
|
|
e7a70b05af | ||
|
|
cf73286938 | ||
|
|
e6f80c0adc | ||
|
|
5e31d7b6d0 | ||
|
|
649f2ad7b7 | ||
|
|
fade1cdf1d | ||
|
|
d261105a86 | ||
|
|
b3d3e8987b | ||
|
|
4e91f3777a | ||
|
|
5584240c7f | ||
|
|
7126a39092 |
6
Makefile
6
Makefile
@@ -44,13 +44,13 @@ YACC=bison -d -v -t
|
||||
|
||||
###########################################################################
|
||||
|
||||
CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
|
||||
CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
|
||||
llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
|
||||
util.cpp
|
||||
HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
|
||||
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
|
||||
opt.h stmt.h sym.h type.h util.h
|
||||
BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
|
||||
builtins-sse4.ll builtins-sse4x2.ll
|
||||
builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
|
||||
BISON_SRC=parse.yy
|
||||
FLEX_SRC=lex.ll
|
||||
|
||||
|
||||
@@ -15,8 +15,8 @@ code.
|
||||
|
||||
ispc is an open source compiler under the BSD license; see the file
|
||||
LICENSE.txt. ispc supports Windows, Mac, and Linux, with both x86 and
|
||||
x86-64 targets. It currently supports the SSE2 and SSE4 instruction sets,
|
||||
though support for AVX should be available soon.
|
||||
x86-64 targets. It currently supports the SSE2, SSE4, and AVX instruction
|
||||
sets.
|
||||
|
||||
For more information and examples, as well as a wiki and the bug database,
|
||||
see the ispc distribution site, http://ispc.github.com.
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -31,36 +31,35 @@
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef ISPC_CPUID_H
|
||||
#define ISPC_CPUID_H 1
|
||||
/** @file ast.cpp
|
||||
@brief
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// Provides a __cpuid() function with same signature as below
|
||||
#include <intrin.h>
|
||||
#else
|
||||
static void __cpuid(int info[4], int infoType) {
|
||||
__asm__ __volatile__ ("cpuid"
|
||||
: "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
: "0" (infoType));
|
||||
}
|
||||
#endif
|
||||
#include "ast.h"
|
||||
#include "decl.h"
|
||||
#include "func.h"
|
||||
#include "type.h"
|
||||
#include "sym.h"
|
||||
|
||||
inline bool CPUSupportsSSE2() {
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
return (info[3] & (1 << 26)) != 0;
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// ASTNode
|
||||
|
||||
ASTNode::~ASTNode() {
|
||||
}
|
||||
|
||||
inline bool CPUSupportsSSE4() {
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
return (info[2] & (1 << 19)) != 0;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// AST
|
||||
|
||||
void
|
||||
AST::AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code) {
|
||||
functions.push_back(new Function(ds, decl, code));
|
||||
}
|
||||
|
||||
inline bool CPUSupportsAVX() {
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
return (info[2] & (1 << 28)) != 0;
|
||||
|
||||
void
|
||||
AST::GenerateIR() {
|
||||
for (unsigned int i = 0; i < functions.size(); ++i)
|
||||
functions[i]->GenerateIR();
|
||||
}
|
||||
|
||||
#endif // ISPC_CPUID_H
|
||||
93
ast.h
Normal file
93
ast.h
Normal file
@@ -0,0 +1,93 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file ast.h
|
||||
@brief
|
||||
*/
|
||||
|
||||
#ifndef ISPC_AST_H
|
||||
#define ISPC_AST_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include <vector>
|
||||
|
||||
/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
|
||||
|
||||
This class defines a basic interface that all abstract syntax tree
|
||||
(AST) nodes must implement. The base classes for both expressions
|
||||
(Expr) and statements (Stmt) inherit from this class.
|
||||
*/
|
||||
class ASTNode {
|
||||
public:
|
||||
ASTNode(SourcePos p) : pos(p) { }
|
||||
virtual ~ASTNode();
|
||||
|
||||
/** The Optimize() method should perform any appropriate early-stage
|
||||
optimizations on the node (e.g. constant folding). The caller
|
||||
should use the returned ASTNode * in place of the original node.
|
||||
This method may return NULL if an error is encountered during
|
||||
optimization. */
|
||||
virtual ASTNode *Optimize() = 0;
|
||||
|
||||
/** Type checking should be performed by the node when this method is
|
||||
called. In the event of an error, a NULL value may be returned.
|
||||
As with ASTNode::Optimize(), the caller should store the returned
|
||||
pointer in place of the original ASTNode *. */
|
||||
virtual ASTNode *TypeCheck() = 0;
|
||||
|
||||
virtual int EstimateCost() const = 0;
|
||||
|
||||
/** All AST nodes must track the file position where they are
|
||||
defined. */
|
||||
const SourcePos pos;
|
||||
};
|
||||
|
||||
|
||||
/** Simple representation of the abstract syntax trees for all of the
|
||||
functions declared in a compilation unit.
|
||||
*/
|
||||
class AST {
|
||||
public:
|
||||
/** Add the AST for a function described by the given declaration
|
||||
information and source code. */
|
||||
void AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code);
|
||||
|
||||
/** Generate LLVM IR for all of the functions into the current
|
||||
module. */
|
||||
void GenerateIR();
|
||||
|
||||
private:
|
||||
std::vector<Function *> functions;
|
||||
};
|
||||
|
||||
#endif // ISPC_AST_H
|
||||
@@ -4,6 +4,8 @@ import sys
|
||||
import string
|
||||
import re
|
||||
import subprocess
|
||||
import platform
|
||||
import os
|
||||
|
||||
length=0
|
||||
|
||||
@@ -14,8 +16,12 @@ target = re.sub("\.ll$", "", target)
|
||||
target = re.sub("\.c$", "", target)
|
||||
target = re.sub("-", "_", target)
|
||||
|
||||
llvm_as="llvm-as"
|
||||
if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT") != -1:
|
||||
llvm_as = os.getenv("LLVM_INSTALL_DIR").replace("\\", "/") + "/bin/" + llvm_as
|
||||
|
||||
try:
|
||||
as_out=subprocess.Popen([ "llvm-as", "-", "-o", "-"], stdout=subprocess.PIPE)
|
||||
as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
|
||||
except IOError:
|
||||
print >> sys.stderr, "Couldn't open " + src
|
||||
sys.exit(1)
|
||||
|
||||
16
buildall.bat
Normal file
16
buildall.bat
Normal file
@@ -0,0 +1,16 @@
|
||||
@echo off
|
||||
|
||||
REM If LLVM_INSTALL_DIR isn't set globally in your environment,
|
||||
REM it can be set here:
|
||||
set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
|
||||
|
||||
REM Both the LLVM binaries and python need to be in the path
|
||||
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
|
||||
|
||||
msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
|
||||
msbuild ispc_test.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
|
||||
|
||||
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Release /t:rebuild
|
||||
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Debug /t:rebuild
|
||||
msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild
|
||||
msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Debug /t:rebuild
|
||||
30
builtins-c.c
30
builtins-c.c
@@ -51,8 +51,13 @@
|
||||
*/
|
||||
|
||||
|
||||
#ifndef _MSC_VER
|
||||
#include <unistd.h>
|
||||
#endif // !_MSC_VER
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
typedef int Bool;
|
||||
@@ -139,3 +144,28 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
|
||||
int __num_cores() {
|
||||
#ifdef _MSC_VER
|
||||
// This is quite a hack. Including all of windows.h to get this definition
|
||||
// pulls in a bunch of stuff that leads to undefined symbols at link time.
|
||||
// So we don't #include <windows.h> but instead have the equivalent declarations
|
||||
// here. Presumably this struct declaration won't be changing in the future
|
||||
// anyway...
|
||||
struct SYSTEM_INFO {
|
||||
int pad0[2];
|
||||
void *pad1[2];
|
||||
int *pad2;
|
||||
int dwNumberOfProcessors;
|
||||
int pad3[3];
|
||||
};
|
||||
|
||||
struct SYSTEM_INFO sysInfo;
|
||||
extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
|
||||
GetSystemInfo(&sysInfo);
|
||||
return sysInfo.dwNumberOfProcessors;
|
||||
#else
|
||||
return sysconf(_SC_NPROCESSORS_ONLN);
|
||||
#endif // !_MSC_VER
|
||||
}
|
||||
|
||||
123
builtins-dispatch.ll
Normal file
123
builtins-dispatch.ll
Normal file
@@ -0,0 +1,123 @@
|
||||
;; Copyright (c) 2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;; This file defines various functions that are used when generating the
|
||||
;; "dispatch" object/assembly file that has entrypoints for each
|
||||
;; exported function in a module that dispatch to the best available
|
||||
;; variant of that function that will run on the system's CPU.
|
||||
|
||||
;; Stores the best target ISA that the system on which we're actually
|
||||
;; running supports. -1 represents "uninitialized", otherwise this value
|
||||
;; should correspond to one of the enumerant values of Target::ISA from
|
||||
;; ispc.h.
|
||||
|
||||
@__system_best_isa = internal global i32 -1
|
||||
|
||||
declare void @abort() noreturn
|
||||
|
||||
;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
|
||||
;; following code... Specifically, __get_system_isa should return a value
|
||||
;; corresponding to one of the Target::ISA enumerant values that gives the
|
||||
;; most capable ISA that the current system can run.
|
||||
;;
|
||||
;; #ifdef _MSC_VER
|
||||
;; extern void __stdcall __cpuid(int info[4], int infoType);
|
||||
;; #else
|
||||
;; static void __cpuid(int info[4], int infoType) {
|
||||
;; __asm__ __volatile__ ("cpuid"
|
||||
;; : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
;; : "0" (infoType));
|
||||
;; }
|
||||
;; #endif
|
||||
;;
|
||||
;; int32_t __get_system_isa() {
|
||||
;; int info[4];
|
||||
;; __cpuid(info, 1);
|
||||
;; /* NOTE: the values returned below must be the same as the
|
||||
;; corresponding enumerant values in Target::ISA. */
|
||||
;; if ((info[2] & (1 << 28)) != 0)
|
||||
;; return 2; // AVX
|
||||
;; else if ((info[2] & (1 << 19)) != 0)
|
||||
;; return 1; // SSE4
|
||||
;; else if ((info[3] & (1 << 26)) != 0)
|
||||
;; return 0; // SSE2
|
||||
;; else
|
||||
;; abort();
|
||||
;; }
|
||||
|
||||
%0 = type { i32, i32, i32, i32 }
|
||||
|
||||
define internal i32 @__get_system_isa() nounwind ssp {
|
||||
%1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
|
||||
%2 = extractvalue %0 %1, 2
|
||||
%3 = extractvalue %0 %1, 3
|
||||
%4 = and i32 %2, 268435456
|
||||
%5 = icmp eq i32 %4, 0
|
||||
br i1 %5, label %6, label %13
|
||||
|
||||
; <label>:6 ; preds = %0
|
||||
%7 = and i32 %2, 524288
|
||||
%8 = icmp eq i32 %7, 0
|
||||
br i1 %8, label %9, label %13
|
||||
|
||||
; <label>:9 ; preds = %6
|
||||
%10 = and i32 %3, 67108864
|
||||
%11 = icmp eq i32 %10, 0
|
||||
br i1 %11, label %12, label %13
|
||||
|
||||
; <label>:12 ; preds = %9
|
||||
tail call void @abort() noreturn nounwind
|
||||
unreachable
|
||||
|
||||
; <label>:13 ; preds = %9, %6, %0
|
||||
%.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]
|
||||
ret i32 %.0
|
||||
}
|
||||
|
||||
|
||||
;; This function is called by each of the dispatch functions we generate;
|
||||
;; it sets @__system_best_isa if it is unset.
|
||||
|
||||
define internal void @__set_system_isa() {
|
||||
entry:
|
||||
%bi = load i32* @__system_best_isa
|
||||
%unset = icmp eq i32 %bi, -1
|
||||
br i1 %unset, label %set_system_isa, label %done
|
||||
|
||||
set_system_isa:
|
||||
%bival = call i32 @__get_system_isa()
|
||||
store i32 %bival, i32* @__system_best_isa
|
||||
ret void
|
||||
|
||||
done:
|
||||
ret void
|
||||
}
|
||||
|
||||
58
builtins.cpp
58
builtins.cpp
@@ -163,7 +163,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
intAsUnsigned);
|
||||
if (eltType == NULL)
|
||||
return NULL;
|
||||
return new ReferenceType(new ArrayType(eltType, at->getNumElements()),
|
||||
// FIXME: this needs to be fixed when arrays can have
|
||||
// over 4G elements...
|
||||
return new ReferenceType(new ArrayType(eltType, (int)at->getNumElements()),
|
||||
false);
|
||||
}
|
||||
}
|
||||
@@ -336,9 +338,9 @@ lCheckModuleIntrinsics(llvm::Module *module) {
|
||||
@param module Module to link the bitcode into
|
||||
@param symbolTable Symbol table to add definitions to
|
||||
*/
|
||||
static void
|
||||
lAddBitcode(const unsigned char *bitcode, int length,
|
||||
llvm::Module *module, SymbolTable *symbolTable) {
|
||||
void
|
||||
AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
llvm::Module *module, SymbolTable *symbolTable) {
|
||||
std::string bcErr;
|
||||
llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
|
||||
llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
|
||||
@@ -365,7 +367,8 @@ lAddBitcode(const unsigned char *bitcode, int length,
|
||||
std::string(linkError);
|
||||
if (llvm::Linker::LinkModules(module, bcModule, &linkError))
|
||||
Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
|
||||
lAddModuleSymbols(module, symbolTable);
|
||||
if (symbolTable != NULL)
|
||||
lAddModuleSymbols(module, symbolTable);
|
||||
lCheckModuleIntrinsics(module);
|
||||
}
|
||||
}
|
||||
@@ -377,8 +380,8 @@ lAddBitcode(const unsigned char *bitcode, int length,
|
||||
static void
|
||||
lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
||||
SymbolTable *symbolTable) {
|
||||
Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32);
|
||||
pw->isStatic = true;
|
||||
Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32,
|
||||
SC_STATIC);
|
||||
pw->constValue = new ConstExpr(pw->type, val, SourcePos());
|
||||
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
|
||||
llvm::Constant *linit = LLVMInt32(val);
|
||||
@@ -395,8 +398,7 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
||||
SymbolTable *symbolTable) {
|
||||
std::vector<const Type *> args;
|
||||
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
|
||||
Symbol *sym = new Symbol(name, SourcePos(), ft);
|
||||
sym->isStatic = true;
|
||||
Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
|
||||
|
||||
llvm::Function *func = module->getFunction(name);
|
||||
assert(func != NULL); // it should be declared already...
|
||||
@@ -413,8 +415,7 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
||||
static void
|
||||
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
Symbol *pidx = new Symbol("programIndex", SourcePos(),
|
||||
AtomicType::VaryingConstInt32);
|
||||
pidx->isStatic = true;
|
||||
AtomicType::VaryingConstInt32, SC_STATIC);
|
||||
|
||||
int pi[ISPC_MAX_NVEC];
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
@@ -437,14 +438,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
if (g->target.is32bit) {
|
||||
extern unsigned char builtins_bitcode_c_32[];
|
||||
extern int builtins_bitcode_c_32_length;
|
||||
lAddBitcode(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
|
||||
module, symbolTable);
|
||||
AddBitcodeToModule(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
|
||||
module, symbolTable);
|
||||
}
|
||||
else {
|
||||
extern unsigned char builtins_bitcode_c_64[];
|
||||
extern int builtins_bitcode_c_64_length;
|
||||
lAddBitcode(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
|
||||
module, symbolTable);
|
||||
AddBitcodeToModule(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
|
||||
module, symbolTable);
|
||||
}
|
||||
|
||||
// Next, add the target's custom implementations of the various needed
|
||||
@@ -453,22 +454,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
case Target::SSE2:
|
||||
extern unsigned char builtins_bitcode_sse2[];
|
||||
extern int builtins_bitcode_sse2_length;
|
||||
lAddBitcode(builtins_bitcode_sse2, builtins_bitcode_sse2_length, module,
|
||||
symbolTable);
|
||||
AddBitcodeToModule(builtins_bitcode_sse2, builtins_bitcode_sse2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case Target::SSE4:
|
||||
extern unsigned char builtins_bitcode_sse4[];
|
||||
extern int builtins_bitcode_sse4_length;
|
||||
extern unsigned char builtins_bitcode_sse4x2[];
|
||||
extern int builtins_bitcode_sse4x2_length;
|
||||
extern unsigned char builtins_bitcode_sse4_x2[];
|
||||
extern int builtins_bitcode_sse4_x2_length;
|
||||
switch (g->target.vectorWidth) {
|
||||
case 4:
|
||||
lAddBitcode(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
|
||||
module, symbolTable);
|
||||
AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 8:
|
||||
lAddBitcode(builtins_bitcode_sse4x2, builtins_bitcode_sse4x2_length,
|
||||
module, symbolTable);
|
||||
AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
@@ -479,14 +480,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
case 8:
|
||||
extern unsigned char builtins_bitcode_avx[];
|
||||
extern int builtins_bitcode_avx_length;
|
||||
lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module,
|
||||
symbolTable);
|
||||
AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 16:
|
||||
extern unsigned char builtins_bitcode_avx_x2[];
|
||||
extern int builtins_bitcode_avx_x2_length;
|
||||
lAddBitcode(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
|
||||
module, symbolTable);
|
||||
AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
@@ -522,11 +523,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
// definitions added. Disable emission of performance warnings for
|
||||
// now, since the user doesn't care about any of that in the stdlib
|
||||
// implementation...
|
||||
bool epf = g->emitPerfWarnings;
|
||||
g->emitPerfWarnings = false;
|
||||
extern char stdlib_code[];
|
||||
yy_scan_string(stdlib_code);
|
||||
yyparse();
|
||||
g->emitPerfWarnings = epf;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,4 +55,7 @@
|
||||
void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
|
||||
bool includeStdlib);
|
||||
|
||||
void AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
llvm::Module *module, SymbolTable *symbolTable = NULL);
|
||||
|
||||
#endif // ISPC_STDLIB_H
|
||||
|
||||
129
builtins.m4
129
builtins.m4
@@ -622,40 +622,6 @@ forloop(i, 1, eval($1-1), `
|
||||
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; global_atomic
|
||||
;; Defines the implementation of a function that handles the mapping from
|
||||
;; an ispc atomic function to the underlying LLVM intrinsics. Specifically,
|
||||
;; the function handles looping over the active lanes, calling the underlying
|
||||
;; scalar atomic intrinsic for each one, and assembling the vector result.
|
||||
;;
|
||||
;; Takes four parameters:
|
||||
;; $1: vector width of the target
|
||||
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
|
||||
;; (add, sub...)
|
||||
;; $3: return type of the LLVM atomic (e.g. i32)
|
||||
;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
|
||||
|
||||
define(`global_atomic', `
|
||||
|
||||
declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
|
||||
|
||||
define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%rptr = alloca <$1 x $3>
|
||||
%rptr32 = bitcast <$1 x $3> * %rptr to $3 *
|
||||
|
||||
per_lane($1, <$1 x i32> %mask, `
|
||||
%v_LANE_ID = extractelement <$1 x $3> %val, i32 LANE
|
||||
%r_LANE_ID = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %v_LANE_ID)
|
||||
%rp_LANE_ID = getelementptr $3 * %rptr32, i32 LANE
|
||||
store $3 %r_LANE_ID, $3 * %rp_LANE_ID')
|
||||
|
||||
%r = load <$1 x $3> * %rptr
|
||||
ret <$1 x $3> %r
|
||||
}
|
||||
')
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; global_atomic_associative
|
||||
@@ -681,18 +647,20 @@ define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
|
||||
|
||||
define(`global_atomic_associative', `
|
||||
|
||||
declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
|
||||
|
||||
;; note that the mask is expected to be of type $3, so the caller must ensure
|
||||
;; that for 64-bit types, the mask is cast to a signed int before being passed
|
||||
;; to this so that it is properly sign extended... (The code in stdlib.ispc
|
||||
;; does do this..)
|
||||
|
||||
define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
|
||||
<$1 x $3> %mask) nounwind alwaysinline {
|
||||
<$1 x i32> %m) nounwind alwaysinline {
|
||||
; first, for any lanes where the mask is off, compute a vector where those lanes
|
||||
; hold the identity value..
|
||||
|
||||
; for the bit tricks below, we need the mask to be sign extended to be
|
||||
; the size of the element type.
|
||||
ifelse($3, `i64', `%mask = sext <$1 x i32> %m to <$1 x i64>')
|
||||
ifelse($3, `i32', `
|
||||
; silly workaround to do %mask = %m, which is not possible directly..
|
||||
%maskmem = alloca <$1 x i32>
|
||||
store <$1 x i32> %m, <$1 x i32> * %maskmem
|
||||
%mask = load <$1 x i32> * %maskmem'
|
||||
)
|
||||
; zero out any lanes that are off
|
||||
%valoff = and <$1 x $3> %val, %mask
|
||||
|
||||
@@ -751,7 +719,7 @@ define(`global_atomic_uniform', `
|
||||
|
||||
declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
|
||||
|
||||
define internal $3 @__atomic_$2_$4_global($3 * %ptr, $3 %val,
|
||||
define internal $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
|
||||
ret $3 %r
|
||||
@@ -764,9 +732,10 @@ define internal $3 @__atomic_$2_$4_global($3 * %ptr, $3 %val,
|
||||
;; $2: llvm type of the vector elements (e.g. i32)
|
||||
;; $3: ispc type of the elements (e.g. int32)
|
||||
|
||||
define(`global_swap', `
|
||||
declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val)
|
||||
declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)
|
||||
|
||||
declare $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)
|
||||
define(`global_swap', `
|
||||
|
||||
define internal <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
@@ -782,6 +751,12 @@ define internal <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
|
||||
%r = load <$1 x $2> * %rptr
|
||||
ret <$1 x $2> %r
|
||||
}
|
||||
|
||||
define internal $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)
|
||||
ret $2 %r
|
||||
}
|
||||
')
|
||||
|
||||
|
||||
@@ -811,6 +786,12 @@ define internal <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $
|
||||
%r = load <$1 x $2> * %rptr
|
||||
ret <$1 x $2> %r
|
||||
}
|
||||
|
||||
define internal $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
|
||||
$2 %val, <$1 x i32> %mask) nounwind alwaysinline {
|
||||
%r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)
|
||||
ret $2 %r
|
||||
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@@ -853,10 +834,9 @@ define(`stdlib_core', `
|
||||
|
||||
declare i32 @__fast_masked_vload()
|
||||
|
||||
declare i8* @ISPCMalloc(i64, i32) nounwind
|
||||
declare i8* @ISPCFree(i8*) nounwind
|
||||
declare void @ISPCLaunch(i8*, i8*) nounwind
|
||||
declare void @ISPCSync() nounwind
|
||||
declare i8* @ISPCAlloc(i8**, i64, i32) nounwind
|
||||
declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind
|
||||
declare void @ISPCSync(i8*) nounwind
|
||||
declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind
|
||||
|
||||
declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
|
||||
@@ -1228,6 +1208,11 @@ global_atomic_associative($1, sub, i32, int32, 0)
|
||||
global_atomic_associative($1, and, i32, int32, -1)
|
||||
global_atomic_associative($1, or, i32, int32, 0)
|
||||
global_atomic_associative($1, xor, i32, int32, 0)
|
||||
global_atomic_uniform($1, add, i32, int32)
|
||||
global_atomic_uniform($1, sub, i32, int32)
|
||||
global_atomic_uniform($1, and, i32, int32)
|
||||
global_atomic_uniform($1, or, i32, int32)
|
||||
global_atomic_uniform($1, xor, i32, int32)
|
||||
global_atomic_uniform($1, min, i32, int32)
|
||||
global_atomic_uniform($1, max, i32, int32)
|
||||
global_atomic_uniform($1, umin, i32, uint32)
|
||||
@@ -1238,6 +1223,11 @@ global_atomic_associative($1, sub, i64, int64, 0)
|
||||
global_atomic_associative($1, and, i64, int64, -1)
|
||||
global_atomic_associative($1, or, i64, int64, 0)
|
||||
global_atomic_associative($1, xor, i64, int64, 0)
|
||||
global_atomic_uniform($1, add, i64, int64)
|
||||
global_atomic_uniform($1, sub, i64, int64)
|
||||
global_atomic_uniform($1, and, i64, int64)
|
||||
global_atomic_uniform($1, or, i64, int64)
|
||||
global_atomic_uniform($1, xor, i64, int64)
|
||||
global_atomic_uniform($1, min, i64, int64)
|
||||
global_atomic_uniform($1, max, i64, int64)
|
||||
global_atomic_uniform($1, umin, i64, uint64)
|
||||
@@ -1264,6 +1254,24 @@ define internal <$1 x double> @__atomic_swap_double_global(double * %ptr, <$1 x
|
||||
ret <$1 x double> %ret
|
||||
}
|
||||
|
||||
define internal float @__atomic_swap_uniform_float_global(float * %ptr, float %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%iptr = bitcast float * %ptr to i32 *
|
||||
%ival = bitcast float %val to i32
|
||||
%iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <$1 x i32> %mask)
|
||||
%ret = bitcast i32 %iret to float
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal double @__atomic_swap_uniform_double_global(double * %ptr, double %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%iptr = bitcast double * %ptr to i64 *
|
||||
%ival = bitcast double %val to i64
|
||||
%iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <$1 x i32> %mask)
|
||||
%ret = bitcast i64 %iret to double
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
global_atomic_exchange($1, i32, int32)
|
||||
global_atomic_exchange($1, i64, int64)
|
||||
|
||||
@@ -1288,6 +1296,29 @@ define internal <$1 x double> @__atomic_compare_exchange_double_global(double *
|
||||
%ret = bitcast <$1 x i64> %iret to <$1 x double>
|
||||
ret <$1 x double> %ret
|
||||
}
|
||||
|
||||
define internal float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%iptr = bitcast float * %ptr to i32 *
|
||||
%icmp = bitcast float %cmp to i32
|
||||
%ival = bitcast float %val to i32
|
||||
%iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
|
||||
i32 %ival, <$1 x i32> %mask)
|
||||
%ret = bitcast i32 %iret to float
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
|
||||
double %val, <$1 x i32> %mask) nounwind alwaysinline {
|
||||
%iptr = bitcast double * %ptr to i64 *
|
||||
%icmp = bitcast double %cmp to i64
|
||||
%ival = bitcast double %val to i64
|
||||
%iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp,
|
||||
i64 %ival, <$1 x i32> %mask)
|
||||
%ret = bitcast i64 %iret to double
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
')
|
||||
|
||||
|
||||
|
||||
141
ctx.cpp
141
ctx.cpp
@@ -37,6 +37,7 @@
|
||||
|
||||
#include "ctx.h"
|
||||
#include "util.h"
|
||||
#include "func.h"
|
||||
#include "llvmutil.h"
|
||||
#include "type.h"
|
||||
#include "stmt.h"
|
||||
@@ -123,19 +124,20 @@ CFInfo::GetLoop(bool isUniform, llvm::BasicBlock *breakTarget,
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *function,
|
||||
Symbol *funSym, SourcePos firstStmtPos) {
|
||||
FunctionEmitContext::FunctionEmitContext(Function *function, Symbol *funSym,
|
||||
llvm::Function *llvmFunction,
|
||||
SourcePos firstStmtPos) {
|
||||
const Type *rt = function->GetReturnType();
|
||||
|
||||
/* Create a new basic block to store all of the allocas */
|
||||
allocaBlock = llvm::BasicBlock::Create(*g->ctx, "allocas", function, 0);
|
||||
bblock = llvm::BasicBlock::Create(*g->ctx, "entry", function, 0);
|
||||
allocaBlock = llvm::BasicBlock::Create(*g->ctx, "allocas", llvmFunction, 0);
|
||||
bblock = llvm::BasicBlock::Create(*g->ctx, "entry", llvmFunction, 0);
|
||||
/* But jump from it immediately into the real entry block */
|
||||
llvm::BranchInst::Create(bblock, allocaBlock);
|
||||
|
||||
maskPtr = AllocaInst(LLVMTypes::MaskType, "mask_memory");
|
||||
StoreInst(LLVMMaskAllOn, maskPtr);
|
||||
|
||||
funcStartPos = funSym->pos;
|
||||
returnType = rt;
|
||||
maskPtr = NULL;
|
||||
entryMask = NULL;
|
||||
loopMask = NULL;
|
||||
breakLanesPtr = continueLanesPtr = NULL;
|
||||
@@ -144,6 +146,11 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
||||
returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
|
||||
StoreInst(LLVMMaskAllOff, returnedLanesPtr);
|
||||
|
||||
launchedTasks = false;
|
||||
launchGroupHandlePtr = AllocaInst(LLVMTypes::VoidPointerType, "launch_group_handle");
|
||||
StoreInst(llvm::Constant::getNullValue(LLVMTypes::VoidPointerType),
|
||||
launchGroupHandlePtr);
|
||||
|
||||
if (!returnType || returnType == AtomicType::Void)
|
||||
returnValuePtr = NULL;
|
||||
else {
|
||||
@@ -160,31 +167,18 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
||||
llvm::DIType retType = rt->GetDIType(diFile);
|
||||
int flags = llvm::DIDescriptor::FlagPrototyped; // ??
|
||||
diFunction = m->diBuilder->createFunction(diFile, /* scope */
|
||||
function->getName(), // mangled
|
||||
llvmFunction->getName(), // mangled
|
||||
funSym->name,
|
||||
diFile,
|
||||
funcStartPos.first_line,
|
||||
retType,
|
||||
funSym->isStatic,
|
||||
funSym->storageClass == SC_STATIC,
|
||||
true, /* is definition */
|
||||
flags,
|
||||
g->opt.level > 0,
|
||||
function);
|
||||
llvmFunction);
|
||||
/* And start a scope representing the initial function scope */
|
||||
StartScope();
|
||||
}
|
||||
|
||||
launchedTasks = false;
|
||||
|
||||
// connect the funciton's mask memory to the __mask symbol
|
||||
Symbol *maskSymbol = m->symbolTable->LookupVariable("__mask");
|
||||
assert(maskSymbol != NULL);
|
||||
maskSymbol->storagePtr = maskPtr;
|
||||
|
||||
// add debugging info for __mask, programIndex, ...
|
||||
if (m->diBuilder) {
|
||||
maskSymbol->pos = funcStartPos;
|
||||
EmitVariableDebugInfo(maskSymbol);
|
||||
|
||||
llvm::DIFile file = funcStartPos.GetDIFile();
|
||||
Symbol *programIndexSymbol = m->symbolTable->LookupVariable("programIndex");
|
||||
@@ -232,6 +226,12 @@ FunctionEmitContext::GetMask() {
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::SetMaskPointer(llvm::Value *p) {
|
||||
maskPtr = p;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::SetEntryMask(llvm::Value *value) {
|
||||
entryMask = value;
|
||||
@@ -759,7 +759,7 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) {
|
||||
FunctionEmitContext::SizeOf(LLVM_TYPE_CONST llvm::Type *ty) {
|
||||
// Emit code to compute the size of the given type using a GEP with a
|
||||
// NULL base pointer, indexing one element of the given type, and
|
||||
// casting the resulting 'pointer' to an int giving its size.
|
||||
@@ -776,24 +776,7 @@ FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) {
|
||||
#endif
|
||||
AddDebugPos(poffset);
|
||||
llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int");
|
||||
|
||||
// And given the size, call the malloc function
|
||||
llvm::Function *fmalloc = m->module->getFunction("ISPCMalloc");
|
||||
assert(fmalloc != NULL);
|
||||
llvm::Value *mem = CallInst(fmalloc, sizeOf, LLVMInt32(align),
|
||||
"raw_argmem");
|
||||
// Cast the void * back to the result pointer type
|
||||
return BitCastInst(mem, ptrType, "mem_bitcast");
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::EmitFree(llvm::Value *ptr) {
|
||||
llvm::Value *freeArg = BitCastInst(ptr, LLVMTypes::VoidPointerType,
|
||||
"argmemfree");
|
||||
llvm::Function *ffree = m->module->getFunction("ISPCFree");
|
||||
assert(ffree != NULL);
|
||||
CallInst(ffree, freeArg);
|
||||
return sizeOf;
|
||||
}
|
||||
|
||||
|
||||
@@ -1912,15 +1895,9 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0,
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::ReturnInst() {
|
||||
if (launchedTasks) {
|
||||
// Automatically add a sync call at the end of any function that
|
||||
// launched tasks
|
||||
SourcePos noPos;
|
||||
noPos.name = "__auto_sync";
|
||||
ExprStmt *es = new ExprStmt(new SyncExpr(noPos), noPos);
|
||||
es->EmitCode(this);
|
||||
delete es;
|
||||
}
|
||||
if (launchedTasks)
|
||||
// Add a sync call at the end of any function that launched tasks
|
||||
SyncInst();
|
||||
|
||||
llvm::Instruction *rinst = NULL;
|
||||
if (returnValuePtr != NULL) {
|
||||
@@ -1943,7 +1920,8 @@ FunctionEmitContext::ReturnInst() {
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
||||
std::vector<llvm::Value *> &argVals) {
|
||||
std::vector<llvm::Value *> &argVals,
|
||||
llvm::Value *launchCount) {
|
||||
if (callee == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
return NULL;
|
||||
@@ -1960,29 +1938,15 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
||||
static_cast<LLVM_TYPE_CONST llvm::StructType *>(pt->getElementType());
|
||||
assert(argStructType->getNumElements() == argVals.size() + 1);
|
||||
|
||||
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
|
||||
assert(falloc != NULL);
|
||||
int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
|
||||
llvm::Value *argmem;
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
// Use malloc() to allocate storage on Windows, since the stack is
|
||||
// generally not big enough there to do enough allocations for lots of
|
||||
// tasks and then things crash horribly...
|
||||
argmem = EmitMalloc(argStructType, align);
|
||||
#else
|
||||
// Otherwise, use alloca for space for the task args, ** unless we're
|
||||
// compiling to AVX, in which case we use malloc after all **. (See
|
||||
// http://llvm.org/bugs/show_bug.cgi?id=10841 for details. There are
|
||||
// limitations in LLVM with respect to dynamic allocas of this sort
|
||||
// when the stack also has to be 32-byte aligned...).
|
||||
if (g->target.isa == Target::AVX)
|
||||
argmem = EmitMalloc(argStructType, align);
|
||||
else
|
||||
// KEY DETAIL: pass false to the call of
|
||||
// FunctionEmitContext::AllocaInst so that the alloca doesn't
|
||||
// happen just once at the top of the function, but happens each
|
||||
// time the enclosing basic block executes.
|
||||
argmem = AllocaInst(argStructType, "argmem", align, false);
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType);
|
||||
std::vector<llvm::Value *> allocArgs;
|
||||
allocArgs.push_back(launchGroupHandlePtr);
|
||||
allocArgs.push_back(SizeOf(argStructType));
|
||||
allocArgs.push_back(LLVMInt32(align));
|
||||
llvm::Value *voidmem = CallInst(falloc, allocArgs, "args_ptr");
|
||||
llvm::Value *argmem = BitCastInst(voidmem, pt);
|
||||
|
||||
// Copy the values of the parameters into the appropriate place in
|
||||
// the argument block
|
||||
@@ -2004,5 +1968,32 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
||||
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
|
||||
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
|
||||
assert(flaunch != NULL);
|
||||
return CallInst(flaunch, fptr, voidmem, "");
|
||||
std::vector<llvm::Value *> args;
|
||||
args.push_back(launchGroupHandlePtr);
|
||||
args.push_back(fptr);
|
||||
args.push_back(voidmem);
|
||||
args.push_back(launchCount);
|
||||
return CallInst(flaunch, args, "");
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::SyncInst() {
|
||||
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr, NULL);
|
||||
llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
|
||||
llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp,
|
||||
llvm::CmpInst::ICMP_NE,
|
||||
launchGroupHandle, nullPtrValue);
|
||||
llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
|
||||
llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
|
||||
BranchInst(bSync, bPostSync, nonNull);
|
||||
|
||||
SetCurrentBasicBlock(bSync);
|
||||
llvm::Function *fsync = m->module->getFunction("ISPCSync");
|
||||
if (fsync == NULL)
|
||||
FATAL("Couldn't find ISPCSync declaration?!");
|
||||
CallInst(fsync, launchGroupHandle, "");
|
||||
BranchInst(bPostSync);
|
||||
|
||||
SetCurrentBasicBlock(bPostSync);
|
||||
}
|
||||
|
||||
32
ctx.h
32
ctx.h
@@ -59,14 +59,15 @@ struct CFInfo;
|
||||
class FunctionEmitContext {
|
||||
public:
|
||||
/** Create a new FunctionEmitContext.
|
||||
@param returnType The return type of the function
|
||||
@param function LLVM function in the current module that corresponds
|
||||
@param function The Function object representing the function
|
||||
@param sym Symbol that corresponds to the function
|
||||
@param llvmFunction LLVM function in the current module that corresponds
|
||||
to the function
|
||||
@param funSym Symbol that corresponds to the function
|
||||
@param firstStmtPos Source file position of the first statement in the
|
||||
function
|
||||
*/
|
||||
FunctionEmitContext(const Type *returnType, llvm::Function *function, Symbol *funSym,
|
||||
FunctionEmitContext(Function *function, Symbol *funSym,
|
||||
llvm::Function *llvmFunction,
|
||||
SourcePos firstStmtPos);
|
||||
~FunctionEmitContext();
|
||||
|
||||
@@ -86,6 +87,8 @@ public:
|
||||
/** Returns the current mask value */
|
||||
llvm::Value *GetMask();
|
||||
|
||||
void SetMaskPointer(llvm::Value *p);
|
||||
|
||||
/** Provides the value of the mask at function entry */
|
||||
void SetEntryMask(llvm::Value *val);
|
||||
|
||||
@@ -210,15 +213,8 @@ public:
|
||||
i32. */
|
||||
llvm::Value *I1VecToBoolVec(llvm::Value *b);
|
||||
|
||||
/** Emit code to call the user-supplied ISPCMalloc function to
|
||||
allocate space for an object of thee given type. Returns the
|
||||
pointer value returned by the ISPCMalloc call. */
|
||||
llvm::Value *EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align = 0);
|
||||
|
||||
/** Emit code to call the user-supplied ISPCFree function, passing it
|
||||
the given pointer to storage previously allocated by an
|
||||
EmitMalloc() call. */
|
||||
void EmitFree(llvm::Value *ptr);
|
||||
/** Returns the size of the given type. */
|
||||
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *ty);
|
||||
|
||||
/** If the user has asked to compile the program with instrumentation,
|
||||
this inserts a callback to the user-supplied instrumentation
|
||||
@@ -399,7 +395,10 @@ public:
|
||||
/** Launch an asynchronous task to run the given function, passing it
|
||||
the given argument values. */
|
||||
llvm::Instruction *LaunchInst(llvm::Function *callee,
|
||||
std::vector<llvm::Value *> &argVals);
|
||||
std::vector<llvm::Value *> &argVals,
|
||||
llvm::Value *launchCount);
|
||||
|
||||
void SyncInst();
|
||||
|
||||
llvm::Instruction *ReturnInst();
|
||||
/** @} */
|
||||
@@ -489,6 +488,11 @@ private:
|
||||
/** True if a 'launch' statement has been encountered in the function. */
|
||||
bool launchedTasks;
|
||||
|
||||
/** This is a pointer to a void * that is passed to the ISPCLaunch(),
|
||||
ISPCAlloc(), and ISPCSync() routines as a handle to the group of
|
||||
tasks launched from the current function. */
|
||||
llvm::Value *launchGroupHandlePtr;
|
||||
|
||||
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
|
||||
static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
|
||||
bool ifsInLoopAllUniform() const;
|
||||
|
||||
6
decl.cpp
6
decl.cpp
@@ -101,9 +101,7 @@ Declarator::AddArrayDimension(int size) {
|
||||
void
|
||||
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
|
||||
sym->type = GetType(ds);
|
||||
|
||||
if (ds->storageClass == SC_STATIC)
|
||||
sym->isStatic = true;
|
||||
sym->storageClass = ds->storageClass;
|
||||
}
|
||||
|
||||
|
||||
@@ -237,7 +235,7 @@ Declarator::GetType(DeclSpecs *ds) const {
|
||||
sprintf(buf, "__anon_parameter_%d", i);
|
||||
sym = new Symbol(buf, pos);
|
||||
Declarator *declarator = new Declarator(sym, sym->pos);
|
||||
sym->type = declarator->GetType(ds);
|
||||
sym->type = declarator->GetType(d->declSpecs);
|
||||
d->declarators.push_back(declarator);
|
||||
}
|
||||
else {
|
||||
|
||||
@@ -1,3 +1,55 @@
|
||||
=== v1.0.11 === (6 October 2011)
|
||||
|
||||
The main new feature in this release is support for generating code for
|
||||
multiple targets (e.g., SSE2, SSE4, and AVX) and having the compiled code
|
||||
select the best variant at execution time. For more information, see
|
||||
http://ispc.github.com/ispc.html#compiling-with-support-for-multiple-instruction-sets.
|
||||
|
||||
All of the examples now take advantage of the support for multiple
|
||||
compilation targets; thus, if one has an AVX system, it's not necessary to
|
||||
recompile the examples to use the AVX target.
|
||||
|
||||
Performance of the built-in task system that is used in the examples has
|
||||
been improved.
|
||||
|
||||
Finally, the print() statement now works on OSX; it had been broken for the
|
||||
last few releases.
|
||||
|
||||
=== v1.0.10 === (30 September 2011)
|
||||
|
||||
This release features an extensive new example showing the application of
|
||||
ispc to a deferred shading algorithm for scenes with thousands of lights
|
||||
(examples/deferred). This is an implementation of the algorithm that Johan
|
||||
Andersson described at SIGGRAPH 2009 and was implemented by Andrew
|
||||
Lauritzen and Jefferson Montgomery. The basic idea is that a pre-rendered
|
||||
G-buffer is partitioned into tiles, and in each tile, the set of lights
|
||||
that contribute to the tile is computed. Then, the pixels in the tile are
|
||||
shaded using those light sources. (See slides 19-29 of
|
||||
http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
|
||||
for more details on the algorithm.)
|
||||
|
||||
The mechanism for launching tasks from ispc code has been generalized to
|
||||
allow multiple tasks to be launched with a single launch call (see
|
||||
http://ispc.github.com/ispc.html#task-parallelism-language-syntax for more
|
||||
information.)
|
||||
|
||||
A few new functions have been added to the standard library: num_cores()
|
||||
returns the number of cores in the system's CPU, and variants of all of the
|
||||
atomic operators that take 'uniform' values as parameters have been added.
|
||||
|
||||
=== v1.0.9 === (26 September 2011)
|
||||
|
||||
The binary release of v1.0.9 is the first that supports AVX code
|
||||
generation. Two targets are provided: "avx", which runs with a
|
||||
programCount of 8, and "avx-x2" which runs 16 program instances
|
||||
simultaneously. (This binary is also built using the in-progress LLVM 3.0
|
||||
development libraries, while previous ones have been built with the
|
||||
released 2.9 version of LLVM.)
|
||||
|
||||
This release has no other significant changes beyond a number of small
|
||||
bugfixes (https://github.com/ispc/ispc/issues/100,
|
||||
https://github.com/ispc/ispc/issues/101, https://github.com/ispc/ispc/issues/103.)
|
||||
|
||||
=== v1.0.8 === (19 September 2011)
|
||||
|
||||
A number of improvements have been made to handling of 'if' statements in
|
||||
|
||||
467
docs/ispc.txt
467
docs/ispc.txt
@@ -55,7 +55,8 @@ Contents:
|
||||
|
||||
* `Using The ISPC Compiler`_
|
||||
|
||||
+ `Command-line Options`_
|
||||
+ `Basic Command-line Options`_
|
||||
+ `Selecting The Compilation Target`_
|
||||
|
||||
* `The ISPC Language`_
|
||||
|
||||
@@ -80,7 +81,8 @@ Contents:
|
||||
+ `Program Instance Convergence`_
|
||||
+ `Data Races`_
|
||||
+ `Uniform Variables and Varying Control Flow`_
|
||||
+ `Task Parallelism in ISPC`_
|
||||
+ `Task Parallelism: Language Syntax`_
|
||||
+ `Task Parallelism: Runtime Requirements`_
|
||||
|
||||
* `The ISPC Standard Library`_
|
||||
|
||||
@@ -91,6 +93,7 @@ Contents:
|
||||
+ `Conversions To and From Half-Precision Floats`_
|
||||
+ `Atomic Operations and Memory Fences`_
|
||||
+ `Prefetches`_
|
||||
+ `System Information`_
|
||||
+ `Low-Level Bits`_
|
||||
|
||||
* `Interoperability with the Application`_
|
||||
@@ -115,6 +118,8 @@ Contents:
|
||||
+ `Using Scan Operations For Variable Output`_
|
||||
+ `Application-Supplied Execution Masks`_
|
||||
+ `Explicit Vector Programming With Uniform Short Vector Types`_
|
||||
+ `Choosing A Target Vector Width`_
|
||||
+ `Compiling With Support For Multiple Instruction Sets`_
|
||||
|
||||
* `Disclaimer and Legal Information`_
|
||||
|
||||
@@ -286,8 +291,8 @@ with application code, enter the following command
|
||||
compiling it. (This functionality can be disabled with the ``--nocpp``
|
||||
command-line argument.)
|
||||
|
||||
Command-line Options
|
||||
--------------------
|
||||
Basic Command-line Options
|
||||
--------------------------
|
||||
|
||||
The ``ispc`` executable can be run with ``--help`` to print a list of
|
||||
accepted command-line arguments. By default, the compiler compiles the
|
||||
@@ -295,56 +300,83 @@ provided program (and issues warnings and errors), but doesn't
|
||||
generate any output.
|
||||
|
||||
If the ``-o`` flag is given, it will generate an output file (a native
|
||||
object file by default). To generate a text assembly file, pass
|
||||
``--emit-asm``:
|
||||
object file by default).
|
||||
|
||||
::
|
||||
|
||||
ispc foo.ispc -o foo.s --emit-asm
|
||||
ispc foo.ispc -o foo.obj
|
||||
|
||||
To generate a text assembly file, pass ``--emit-asm``:
|
||||
|
||||
::
|
||||
|
||||
ispc foo.ispc -o foo.asm --emit-asm
|
||||
|
||||
To generate LLVM bitcode, use the ``--emit-llvm`` flag.
|
||||
|
||||
By default, an optimized x86-64 object file tuned for Intel® Core
|
||||
processors CPUs is built. You can use the ``--arch`` command line flag to
|
||||
specify a 32-bit x86 target:
|
||||
|
||||
::
|
||||
|
||||
ispc foo.ispc -o foo.obj --arch=x86
|
||||
|
||||
Optimizations can be turned off with ``-O0``:
|
||||
Optimizations are on by default; they can be turned off with ``-O0``:
|
||||
|
||||
::
|
||||
|
||||
ispc foo.ispc -o foo.obj -O0
|
||||
|
||||
On Mac\* and Linux\*, there is early support for generating debugging
|
||||
symbols; this is enabled with the ``-g`` command-line flag.
|
||||
On Mac\* and Linux\*, there is basic support for generating debugging
|
||||
symbols; this is enabled with the ``-g`` command-line flag. Using ``-g``
|
||||
causes optimizations to be disabled; to compile with debugging symbols and
|
||||
optimization, ``-O1`` should be provided as well as the ``-g`` flag.
|
||||
|
||||
The ``-h`` flag can also be used to direct ``ispc`` to generate a C/C++
|
||||
header file that includes C/C++ declarations of the C-callable ``ispc``
|
||||
functions and the types passed to it.
|
||||
|
||||
On Linux\* and Mac OS\*, ``-D`` can be used to specify definitions to be
|
||||
passed along to the C pre-prcessor, which runs over the program input
|
||||
before it's compiled. On Windows®, pre-processor definitions should be
|
||||
provided to the ``cl`` call.
|
||||
|
||||
By default, the compiler generates x86-64 Intel® SSE4 code. To generate
|
||||
32-bit code, you can use the ``--arch=x86`` command-line flag. To
|
||||
select Intel® SSE2, use ``--target=sse2``.
|
||||
|
||||
``ispc`` supports an alternative method for generating Intel® SSE4 code,
|
||||
where the program is "doubled up" and eight instances of it run in
|
||||
parallel, rather than just four. For workloads that don't require large
|
||||
numbers of registers, this method can lead to significantly more efficient
|
||||
execution thanks to greater instruction level parallelism. This option is
|
||||
selected with ``--target=sse4x2``.
|
||||
The ``-D`` option can be used to specify definitions to be passed along to
|
||||
the pre-processor, which runs over the program input before it's compiled.
|
||||
For example, including ``-DTEST=1`` defines the pre-processor symbol
|
||||
``TEST`` to have the value ``1`` when the program is compiled.
|
||||
|
||||
The compiler issues a number of performance warnings for code constructs
|
||||
that compile to relatively inefficient code. These warnings can be
|
||||
silenced with the ``--wno-perf`` flag (or by using ``--woff``, which turns
|
||||
off all warnings.)
|
||||
off all compiler warnings.)
|
||||
|
||||
Selecting The Compilation Target
|
||||
--------------------------------
|
||||
|
||||
There are three options that affect the compilation target: ``--arch``,
|
||||
which sets the target architecture, ``--cpu``, which sets the target CPU,
|
||||
and ``--target``, which sets the target instruction set.
|
||||
|
||||
By default, the ``ispc`` compiler generates code for the 64-bit x86-64
|
||||
architecture (i.e. ``--arch=x86-64``.) To compile to a 32-bit x86 target,
|
||||
supply ``--arch=x86`` on the command line:
|
||||
|
||||
::
|
||||
|
||||
ispc foo.ispc -o foo.obj --arch=x86
|
||||
|
||||
No other architectures are currently supported.
|
||||
|
||||
The target CPU determines both the default instruction set used as well as
|
||||
which CPU architecture the code is tuned for. ``ispc --help`` provides a
|
||||
list of a number of the supported CPUs. By default, the CPU type of the
|
||||
system on which you're running ``ispc`` is used to determine the target
|
||||
CPU.
|
||||
|
||||
::
|
||||
|
||||
ispc foo.ispc -o foo.obj --cpu=corei7-avx
|
||||
|
||||
Finally, ``--target`` selects between the SSE2, SSE4, and AVX instruction
|
||||
sets. (As general context, SSE2 was first introduced in processors that
|
||||
shipped in 2001, SSE4 was introduced in 2007, and processors with AVX
|
||||
were introduced in 2010. Consult your CPU's manual for specifics on which
|
||||
vector instruction set it supports.)
|
||||
|
||||
By default, the target instruction set is chosen based on which ones are
|
||||
supported by the system on which you're running ``ispc``. You can override
|
||||
this choice with the ``--target`` flag; for example, to select Intel® SSE2,
|
||||
use ``--target=sse2``. (As with the other options in this section, see the
|
||||
output of ``ispc --help`` for a full list of supported targets.)
|
||||
|
||||
|
||||
The ISPC Language
|
||||
@@ -837,8 +869,8 @@ by default. If a function is declared with a ``static`` qualifier, then it
|
||||
is only visible in the file in which it was declared.
|
||||
|
||||
Any function that can be launched with the ``launch`` construct in ``ispc``
|
||||
must have a ``task`` qualifier; see `Task Parallelism in ISPC`_ for more
|
||||
discussion of launching tasks in ``ispc``.
|
||||
must have a ``task`` qualifier; see `Task Parallelism: Language Syntax`_
|
||||
for more discussion of launching tasks in ``ispc``.
|
||||
|
||||
Functions that are intended to be called from C/C++ application code must
|
||||
have the ``export`` qualifier. This causes them to have regular C linkage
|
||||
@@ -939,8 +971,9 @@ execution model is critical for writing efficient and correct programs in
|
||||
|
||||
``ispc`` supports both task parallelism to parallelize across multiple
|
||||
cores and SPMD parallelism to parallelize across the SIMD vector lanes on a
|
||||
single core. This section focuses on SPMD parallelism. See the section
|
||||
`Task Parallelism in ISPC`_ for discussion of task parallelism in ``ispc``.
|
||||
single core. This section focuses on SPMD parallelism. See the sections
|
||||
`Task Parallelism: Language Syntax`_ and `Task Parallelism: Runtime
|
||||
Requirements`_ for discussion of task parallelism in ``ispc``.
|
||||
|
||||
The SPMD-on-SIMD Execution Model
|
||||
--------------------------------
|
||||
@@ -1383,112 +1416,190 @@ be modified in the above code even if *none* of the program instances
|
||||
evaluated a true value for the test, given the ``ispc`` execution model.
|
||||
|
||||
|
||||
Task Parallelism in ISPC
|
||||
------------------------
|
||||
Task Parallelism: Language Syntax
|
||||
---------------------------------
|
||||
|
||||
One option for combining task-parallelism with ``ispc`` is to just use
|
||||
regular task parallelism in the C/C++ application code (be it through
|
||||
Intel® Cilk(tm), Intel® Thread Building Blocks or another task system,
|
||||
etc.), and for tasks to use ``ispc`` for SPMD parallelism across the vector
|
||||
lanes as appropriate. Alternatively, ``ispc`` also has some support for
|
||||
launching tasks from ``ispc`` code. The approach is similar to Intel®
|
||||
Cilk's task launch feature. (See the ``examples/mandelbrot_tasks`` example
|
||||
to see it used in a non-trivial example.)
|
||||
Intel® Cilk(tm), Intel® Thread Building Blocks or another task system), and
|
||||
for tasks to use ``ispc`` for SPMD parallelism across the vector lanes as
|
||||
appropriate. Alternatively, ``ispc`` also has support for launching tasks
|
||||
from ``ispc`` code. The approach is similar to Intel® Cilk's task launch
|
||||
feature. (See the ``examples/mandelbrot_tasks`` example to see it used in
|
||||
a small example.)
|
||||
|
||||
Any function that is launched as a task must be declared with the ``task``
|
||||
qualifier:
|
||||
First, any function that is launched as a task must be declared with the
|
||||
``task`` qualifier:
|
||||
|
||||
::
|
||||
|
||||
task void func(uniform float a[], uniform int start) {
|
||||
....
|
||||
task void func(uniform float a[], uniform int index) {
|
||||
...
|
||||
a[index] = ....
|
||||
}
|
||||
|
||||
Tasks must return ``void``; a compile time error is issued if a
|
||||
non-``void`` task is defined.
|
||||
|
||||
Given a task, one can then write code that launches tasks as follows:
|
||||
Given a task definition, there are two ways to write code that launches
|
||||
tasks, using the ``launch`` construct. First, one task can be launched at
|
||||
a time, with parameters passed to the task to help it determine what part
|
||||
of the overall computation it's responsible for:
|
||||
|
||||
::
|
||||
|
||||
for (uniform int i = 0; i < 100; ++i)
|
||||
launch < func(a, i); >
|
||||
launch < func(a, i) >;
|
||||
|
||||
Note the ``launch`` keyword and the brackets around the function call.
|
||||
This code launches 100 tasks, each of which presumably does some
|
||||
computation keyed off of given the value ``i``. In general, one should
|
||||
launch many more tasks than there are processors in the system to
|
||||
computation that is keyed off of the given value ``i``. In general, one
|
||||
should launch many more tasks than there are processors in the system to
|
||||
ensure good load-balancing, but not so many that the overhead of scheduling
|
||||
and running tasks dominates the computation.
|
||||
|
||||
Program execution continues asynchronously after task launch; thus, the
|
||||
function shouldn't access values being generated by the tasks without
|
||||
synchronization. A function uses a ``sync`` statement to wait for all
|
||||
launched tasks to finish:
|
||||
Alternatively, a number of tasks may be launched from a single ``launch``
|
||||
statement. We might instead write the above example with a single
|
||||
``launch`` like this:
|
||||
|
||||
::
|
||||
|
||||
for (uniform int i = 0; i < 100; ++i)
|
||||
launch < func(a, i); >
|
||||
launch[100] < func2(a) >;
|
||||
|
||||
Where an integer value (not necessarily a compile-time constant) is
|
||||
provided to the ``launch`` keyword in square brackets; this number of tasks
|
||||
will be enqueued to be run asynchronously. Within each of the tasks, two
|
||||
special built-in variables are available--``taskIndex``, and ``taskCount``.
|
||||
The first, ``taskIndex``, ranges from zero to one less than the number of tasks
|
||||
provided to ``launch``, and ``taskCount`` equals the number of launched
|
||||
tasks. Thus, we might use ``taskIndex`` in the implementation of ``func2``
|
||||
to determine which array element to process.
|
||||
|
||||
::
|
||||
|
||||
task void func2(uniform float a[]) {
|
||||
...
|
||||
a[taskIndex] = ...
|
||||
}
|
||||
|
||||
Program execution continues asynchronously after a ``launch`` statement;
|
||||
thus, a function shouldn't access values being generated by the tasks it
|
||||
has launched within the function without synchronization. If results are
|
||||
needed before function return, a function can use a ``sync`` statement to
|
||||
wait for all launched tasks to finish:
|
||||
|
||||
::
|
||||
|
||||
launch[100] < func2(a) >;
|
||||
sync;
|
||||
// now safe to use computed values in a[]...
|
||||
|
||||
Alternatively, any function that launches tasks has an implicit ``sync``
|
||||
before it returns, so that functions that call a function that launches
|
||||
tasks don't have to worry about outstanding asynchronous computation.
|
||||
Alternatively, any function that launches tasks has an automatically-added
|
||||
``sync`` statement before it returns, so that functions that call a
|
||||
function that launches tasks don't have to worry about outstanding
|
||||
asynchronous computation from that function.
|
||||
|
||||
Inside functions with the ``task`` qualifier, two additional built-in
|
||||
variables are provided: ``threadIndex`` and ``threadCount``.
|
||||
``threadCount`` gives the total number of hardware threads that have been
|
||||
launched by the task system. ``threadIndex`` provides an index between
|
||||
zero and ``threadCount-1`` that gives a unique index that corresponds to
|
||||
the hardware thread that is executing the current task. The
|
||||
``threadIndex`` can be used for accessing data that is private to the
|
||||
current thread and thus doesn't require synchronization to access under
|
||||
parallel execution.
|
||||
variables are provided in addition to ``taskIndex`` and ``taskCount``:
|
||||
``threadIndex`` and ``threadCount``. ``threadCount`` gives the total
|
||||
number of hardware threads that have been launched by the task system.
|
||||
``threadIndex`` provides an index between zero and ``threadCount-1`` that
|
||||
gives a unique index that corresponds to the hardware thread that is
|
||||
executing the current task. The ``threadIndex`` can be used for accessing
|
||||
data that is private to the current thread and thus doesn't require
|
||||
synchronization to access under parallel execution.
|
||||
|
||||
Task Parallelism: Runtime Requirements
|
||||
--------------------------------------
|
||||
|
||||
If you use the task launch feature in ``ispc``, you must provide C/C++
|
||||
implementations of two functions and link them into your final executable
|
||||
file. Although these functions may be implemented in either language, they
|
||||
must have "C" linkage (i.e. their prototypes must be declared inside an
|
||||
``extern "C"`` block if they are defined in C++.)
|
||||
implementations of three specific functions that manage launching and
|
||||
synchronizing parallel tasks; these functions must be linked into your
|
||||
executable. Although these functions may be implemented in any
|
||||
language, they must have "C" linkage (i.e. their prototypes must be
|
||||
declared inside an ``extern "C"`` block if they are defined in C++.)
|
||||
|
||||
By using user-supplied versions of these functions, ``ispc`` programs can
|
||||
easily interoperate with software systems that have existing task systems
|
||||
for managing parallelism. If you're using ``ispc`` with a system that
|
||||
isn't otherwise multi-threaded and don't want to write custom
|
||||
implementations of them, you can use the implementations of these functions
|
||||
provided in the ``examples/tasksys.cpp`` file in the ``ispc``
|
||||
distributions.
|
||||
|
||||
If you are implementing your own task system, the remainder of this section
|
||||
discusses the requirements for these calls. You will also likely want to
|
||||
review the example task systems in ``examples/tasksys.cpp`` for reference.
|
||||
If you are not implementing your own task system, you can skip reading the
|
||||
remainder of this section.
|
||||
|
||||
Here are the declarations of the three functions that must be provided to
|
||||
manage tasks in ``ispc``:
|
||||
|
||||
::
|
||||
|
||||
void ISPCLaunch(void *funcptr, void *data);
|
||||
void ISPCSync();
|
||||
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
|
||||
void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
|
||||
void ISPCSync(void *handle);
|
||||
|
||||
On Windows, two additional functions must be provided to dynamically
|
||||
allocate and free memory to store the arguments passed to tasks. (On OSX
|
||||
and Linux, the stack provides memory for task arguments; on Windows, the
|
||||
stack is generally not large enough to do this for large numbers of tasks.)
|
||||
All three of these functions take an opaque handle (or a pointer to an
|
||||
opaque handle) as their first parameter. This handle allows the task
|
||||
system runtime to distinguish between calls to these functions from
|
||||
different functions in ``ispc`` code. In this way, the task system
|
||||
implementation can efficiently wait for completion on just the tasks
|
||||
launched from a single function.
|
||||
|
||||
The first time one of ``ISPCLaunch()`` or ``ISPCAlloc()`` is called in an
|
||||
``ispc`` function, the ``void *`` pointed to by the ``handlePtr`` parameter
|
||||
will be ``NULL``. The implementations of these functions should then
|
||||
initialize ``*handlePtr`` to a unique handle value of some sort. (For
|
||||
example, it might allocate a small structure to record which tasks were
|
||||
launched by the current function.) In subsequent calls to these functions
|
||||
in the emitted ``ispc`` code, the same value for ``handlePtr`` will be
|
||||
passed in, such that loading from ``*handlePtr`` will retrieve the value
|
||||
stored in the first call.
|
||||
|
||||
At function exit (or at an explicit ``sync`` statement), a call to
|
||||
``ISPCSync()`` will be generated if ``*handlePtr`` is non-``NULL``.
|
||||
Therefore, the handle value is passed directly to ``ISPCSync()``, rather
|
||||
than a pointer to it, as in the other functions.
|
||||
|
||||
The ``ISPCAlloc()`` function is used to allocate small blocks of memory to
|
||||
store parameters passed to tasks. It should return a pointer to memory
|
||||
with the given size and alignment. Note that there is no explicit
|
||||
``ISPCFree()`` call; instead, all memory allocated within an ``ispc``
|
||||
function should be freed when ``ISPCSync()`` is called.
|
||||
|
||||
``ISPCLaunch()`` is called to launch one or more asynchronous
|
||||
tasks. Each ``launch`` statement in ``ispc`` code causes a call to
|
||||
``ISPCLaunch()`` to be emitted in the generated code. The three parameters
|
||||
after the handle pointer to this function are relatively straightforward;
|
||||
the ``void *f`` parameter holds a pointer to a function to call to run the
|
||||
work for this task, ``data`` holds a pointer to data to pass to this
|
||||
function, and ``count`` is the number of instances of this function to
|
||||
enqueue for asynchronous execution. (In other words, ``count`` corresponds
|
||||
to the value ``n`` in a multiple-task launch statement like ``launch[n]``.)
|
||||
|
||||
The signature of the provided function pointer ``f`` is
|
||||
|
||||
::
|
||||
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
||||
void ISPCFree(void *ptr);
|
||||
void (*TaskFuncPtr)(void *data, int threadIndex, int threadCount,
|
||||
int taskIndex, int taskCount)
|
||||
|
||||
These are called by the task launch code generated by the ``ispc``
|
||||
compiler; the first is called to launch a task and the second is
|
||||
called to wait for, respectively. (Factoring them out in this way
|
||||
allows ``ispc`` to inter-operate with the application's task system, if
|
||||
any, rather than having a separate one of its own.) To run a particular
|
||||
task, the task system should cast the function pointer to a ``void (*)(void
|
||||
*, int, int)`` function pointer and then call it with the provided ``void
|
||||
*`` data and then an index for the current hardware thread and the total
|
||||
number of hardware threads the task system has launched--in other words:
|
||||
|
||||
::
|
||||
|
||||
typedef void (*TaskFuncType)(void *, int, int);
|
||||
TaskFuncType tft = (TaskFuncType)(funcptr);
|
||||
tft(data, threadIndex, threadCount);
|
||||
|
||||
A number of sample task system implementations are provided with ``ispc``;
|
||||
see the files ``tasks_concrt.cpp``, ``tasks_gcd.cpp`` and
|
||||
``tasks_pthreads.cpp`` in the ``examples/mandelbrot_tasks`` directory of
|
||||
the ``ispc`` distribution.
|
||||
When this function pointer is called by one of the hardware threads managed
|
||||
by the task system, the ``data`` pointer passed to ``ISPCLaunch()`` should
|
||||
be passed to it for its first parameter; ``threadCount`` gives the total
|
||||
number of hardware threads that have been spawned to run tasks and
|
||||
``threadIndex`` should be an integer index between zero and ``threadCount-1``
|
||||
uniquely identifying the hardware thread that is running the task. (These
|
||||
values can be used to index into thread-local storage.)
|
||||
|
||||
The value of ``taskCount`` should be the number of tasks launched in the
|
||||
``launch`` statement that caused the call to ``ISPCLaunch()`` and each of
|
||||
the calls to this function should be given a unique value of ``taskIndex``
|
||||
between zero and ``taskCount-1``, to distinguish which of the instances
|
||||
of the set of launched tasks is running.
|
||||
|
||||
The ISPC Standard Library
|
||||
=========================
|
||||
@@ -1830,12 +1941,19 @@ given value across all of the currently-executing vector lanes.
|
||||
|
||||
::
|
||||
|
||||
uniform float reduce_min(float a, float b)
|
||||
uniform int reduce_min(int a, int b)
|
||||
uniform unsigned int reduce_min(unsigned int a, unsigned int b)
|
||||
uniform float reduce_max(float a, float b)
|
||||
uniform int reduce_max(int a, int b)
|
||||
uniform unsigned int reduce_max(unsigned int a, unsigned int b)
|
||||
uniform float reduce_min(float a)
|
||||
uniform int32 reduce_min(int32 a)
|
||||
uniform unsigned int32 reduce_min(unsigned int32 a)
|
||||
uniform double reduce_min(double a)
|
||||
uniform int64 reduce_min(int64 a)
|
||||
uniform unsigned int64 reduce_min(unsigned int64 a)
|
||||
|
||||
uniform float reduce_max(float a)
|
||||
uniform int32 reduce_max(int32 a)
|
||||
uniform unsigned int32 reduce_max(unsigned int32 a)
|
||||
uniform double reduce_max(double a)
|
||||
uniform int64 reduce_max(int64 a)
|
||||
uniform unsigned int64 reduce_max(unsigned int64 a)
|
||||
|
||||
Finally, you can check to see if a particular value has the same value in
|
||||
all of the currently-running program instances:
|
||||
@@ -2033,12 +2151,12 @@ end.)
|
||||
|
||||
One thing to note is that the value being added to here is a
|
||||
``uniform`` integer, while the increment amount and the return value are
|
||||
``varying``. In other words, the semantics are that each running program
|
||||
instance individually issues the atomic operation with its own ``delta``
|
||||
value and gets the previous value of ``val`` back in return. The atomics
|
||||
for the running program instances may be issued in arbitrary order; it's
|
||||
not guaranteed that they will be issued in ``programIndex`` order, for
|
||||
example.
|
||||
``varying``. In other words, the semantics of this call are that each
|
||||
running program instance individually issues the atomic operation with its
|
||||
own ``delta`` value and gets the previous value of ``val`` back in return.
|
||||
The atomics for the running program instances may be issued in arbitrary
|
||||
order; it's not guaranteed that they will be issued in ``programIndex``
|
||||
order, for example.
|
||||
|
||||
Here are the declarations of the ``int32`` variants of these functions.
|
||||
There are also ``int64`` equivalents as well as variants that take
|
||||
@@ -2056,17 +2174,44 @@ function can be used with ``float`` and ``double`` types as well.)
|
||||
int32 atomic_xor_global(reference uniform int32 val, int32 value)
|
||||
int32 atomic_swap_global(reference uniform int32 val, int32 newval)
|
||||
|
||||
There is also an atomic "compare and exchange" function; it atomically
|
||||
compares the value in "val" to "compare"--if they match, it assigns
|
||||
"newval" to "val". In either case, the old value of "val" is returned.
|
||||
(As with the other atomic operations, there are also ``unsigned`` and
|
||||
64-bit variants of this function. Furthermore, there are ``float`` and
|
||||
``double`` variants as well.)
|
||||
There are also variants of these functions that take ``uniform`` values for
|
||||
the operand and return a ``uniform`` result:
|
||||
|
||||
::
|
||||
|
||||
uniform int32 atomic_add_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_subtract_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_min_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_max_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_and_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_or_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_xor_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_swap_global(reference uniform int32 val,
|
||||
uniform int32 newval)
|
||||
|
||||
There are also atomic swap and "compare and exchange" functions.
|
||||
Compare and exchange atomically compares the value in "val" to
|
||||
"compare"--if they match, it assigns "newval" to "val". In either case,
|
||||
the old value of "val" is returned. (As with the other atomic operations,
|
||||
there are also ``unsigned`` and 64-bit variants of this function.
|
||||
Furthermore, there are ``float`` and ``double`` variants as well.)
|
||||
|
||||
::
|
||||
|
||||
int32 atomic_swap_global(reference uniform int32 val, int32 new)
|
||||
uniform int32 atomic_swap_global(reference uniform int32 val,
|
||||
uniform int32 new)
|
||||
int32 atomic_compare_exchange_global(reference uniform int32 val,
|
||||
int32 compare, int32 newval)
|
||||
uniform int32 atomic_compare_exchange_global(reference uniform int32 val,
|
||||
uniform int32 compare, uniform int32 newval)
|
||||
|
||||
``ispc`` also has a standard library routine that inserts a memory barrier
|
||||
into the code; it ensures that all memory reads and writes prior to be
|
||||
@@ -2115,6 +2260,20 @@ These functions are available for all of the basic types in the
|
||||
language--``int8``, ``int16``, ``int32``, ``float``, and so forth.
|
||||
|
||||
|
||||
System Information
|
||||
------------------
|
||||
|
||||
A routine is available to find the number of CPU cores available in the
|
||||
system:
|
||||
|
||||
::
|
||||
|
||||
int num_cores()
|
||||
|
||||
This value can be useful for adapting the granularity of parallel task
|
||||
decomposition depending on the number of processors in the system.
|
||||
|
||||
|
||||
Low-Level Bits
|
||||
--------------
|
||||
|
||||
@@ -2941,6 +3100,72 @@ Note that ``ispc`` doesn't currently support control-flow based on
|
||||
}
|
||||
|
||||
|
||||
Choosing A Target Vector Width
|
||||
------------------------------
|
||||
|
||||
By default, ``ispc`` compiles to the natural vector width of the target
|
||||
instruction set. For example, for SSE2 and SSE4, it compiles four-wide,
|
||||
and for AVX, it compiles 8-wide. For some programs, higher performance may
|
||||
be seen if the program is compiled to a doubled vector width--8-wide for
|
||||
SSE and 16-wide for AVX.
|
||||
|
||||
For workloads that don't require many registers, this method can lead to
|
||||
significantly more efficient execution thanks to greater instruction level
|
||||
parallelism and amortization of various overhead over more program
|
||||
instances. For other workloads, it may lead to a slowdown due to higher
|
||||
register pressure; trying both approaches for key kernels may be
|
||||
worthwhile.
|
||||
|
||||
This option is currently only available for the SSE4 and AVX targets, and
|
||||
is selected with the ``--target=sse4-x2`` and ``--target=avx-x2`` options,
|
||||
respectively.
|
||||
|
||||
Compiling With Support For Multiple Instruction Sets
|
||||
----------------------------------------------------
|
||||
|
||||
``ispc`` can also generate output that supports multiple target instruction
|
||||
sets, choosing the most appropriate one at runtime. For example, if you
|
||||
run the command:
|
||||
|
||||
::
|
||||
|
||||
ispc foo.ispc -o foo.o --target=sse2,sse4-x2,avx-x2
|
||||
|
||||
Then four object files will be generated: ``foo_sse2.o``, ``foo_sse4.o``,
|
||||
``foo_avx.o``, and ``foo.o``.[#]_ Link all of these into your executable, and
|
||||
when you call a function in ``foo.ispc`` from your application code,
|
||||
``ispc`` will determine which instruction sets are supported by the CPU the
|
||||
code is running on and will call the most appropriate version of the
|
||||
function available.
|
||||
|
||||
.. [#] Similarly, if you choose to generate assembly language output or
|
||||
LLVM bitcode output, multiple versions of those files will be created.
|
||||
|
||||
In general, the version of the function that runs will be the one in the
|
||||
most general instruction set that is supported by the system. If you only
|
||||
compile SSE2 and SSE4 variants and run on a system that supports AVX, for
|
||||
example, then the SSE4 variant will be executed. If the system
|
||||
is not able to run any of the available variants of the function (for
|
||||
example, trying to run a function that only has SSE4 and AVX variants on a
|
||||
system that only supports SSE2), then the standard library ``abort()``
|
||||
function will be called.
|
||||
|
||||
One subtlety is that all non-static global variables (if any) must have the
|
||||
same size and layout with all of the targets used. For example, if you
|
||||
have the global variables:
|
||||
|
||||
::
|
||||
|
||||
uniform int foo[2*programCount];
|
||||
int bar;
|
||||
|
||||
and compile to both SSE2 and AVX targets, both of these variables will have
|
||||
different sizes (the first due to program count having the value 4 for SSE2
|
||||
and 8 for AVX, and the second due to ``varying`` types having different
|
||||
numbers of elements with the two targets--essentially the same issue as the
|
||||
first.)
|
||||
|
||||
|
||||
Disclaimer and Legal Information
|
||||
================================
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
|
||||
# This could be handy for archiving the generated documentation or
|
||||
# if some version control system is used.
|
||||
|
||||
PROJECT_NUMBER = 1.0.8
|
||||
PROJECT_NUMBER = 1.0.11
|
||||
|
||||
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
|
||||
# base path where the generated documentation will be put.
|
||||
@@ -585,7 +585,6 @@ INPUT = builtins.h \
|
||||
ctx.h \
|
||||
decl.h \
|
||||
expr.h \
|
||||
gatherbuf.h \
|
||||
ispc.h \
|
||||
llvmutil.h \
|
||||
module.h \
|
||||
@@ -598,7 +597,6 @@ INPUT = builtins.h \
|
||||
ctx.cpp \
|
||||
decl.cpp \
|
||||
expr.cpp \
|
||||
gatherbuf.cpp \
|
||||
ispc.cpp \
|
||||
llvmutil.cpp \
|
||||
main.cpp \
|
||||
|
||||
@@ -13,6 +13,7 @@ against regular serial C++ implementations, printing out a comparison of
|
||||
the runtimes and the speedup delivered by ispc. It may be instructive to
|
||||
do a side-by-side diff of the C++ and ispc implementations of these
|
||||
algorithms to learn more about writing ispc code.
|
||||
|
||||
|
||||
AOBench
|
||||
=======
|
||||
@@ -27,6 +28,7 @@ It executes the program for the given number of iterations, rendering an
|
||||
(xres x yres) image each time and measuring the computation time with both
|
||||
serial and ispc implementations.
|
||||
|
||||
|
||||
AOBench_Instrumented
|
||||
====================
|
||||
|
||||
@@ -40,12 +42,47 @@ is provided in the instrument.cpp file.
|
||||
*** Note: on Linux, this example currently hits an assertion in LLVM during
|
||||
*** compilation
|
||||
|
||||
|
||||
Deferred
|
||||
========
|
||||
|
||||
This example shows an extensive example of using ispc for efficient
|
||||
deferred shading of scenes with thousands of lights; it's an implementation
|
||||
of the algorithm that Johan Andersson described at SIGGRAPH 2009,
|
||||
implemented by Andrew Lauritzen and Jefferson Montgomery. The basic idea
|
||||
is that a pre-rendered G-buffer is partitioned into tiles, and in each
|
||||
tile, the set of lights that contribute to the tile is first computed.
|
||||
Then, the pixels in the tile are shaded using just those light
|
||||
sources. (See slides 19-29 of
|
||||
http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
|
||||
for more details on the algorithm.)
|
||||
|
||||
This directory includes three implementations of the algorithm:
|
||||
|
||||
- An ispc implementation that first does a static partitioning of the
|
||||
screen into tiles to parallelize across the CPU cores. Within each tile
|
||||
ispc kernels provide highly efficient implementations of the light
|
||||
culling and shading calculations.
|
||||
- A "best practices" serial C++ implementation. This implementation does a
|
||||
dynamic partitioning of the screen, refining tiles with significant Z
|
||||
depth complexity (these tiles often have a large number of lights that
|
||||
affect them). Within each final tile, the pixels are shaded using
|
||||
regular C++ code.
|
||||
- If the Cilk extensions are available in your compiler, an ispc
|
||||
implementation that uses Cilk will also be built.
|
||||
(See http://software.intel.com/en-us/articles/intel-cilk-plus/). Like
|
||||
the "best practices" serial implementation, this version does dynamic
|
||||
tile partitioning for better load balancing and then uses ispc for the
|
||||
light culling and shading.
|
||||
|
||||
|
||||
Mandelbrot
|
||||
==========
|
||||
|
||||
Mandelbrot set generation. This example is extensively documented at the
|
||||
http://ispc.github.com/example.html page.
|
||||
|
||||
|
||||
Mandelbrot_tasks
|
||||
================
|
||||
|
||||
@@ -58,6 +95,7 @@ using tasks with ispc, no task system is mandated; the user is free to plug
|
||||
in any task system they want, for ease of interoperating with existing task
|
||||
systems.
|
||||
|
||||
|
||||
Noise
|
||||
=====
|
||||
|
||||
@@ -71,6 +109,7 @@ Options
|
||||
This program implements both the Black-Scholes and Binomial options pricing
|
||||
models in both ispc and regular serial C++ code.
|
||||
|
||||
|
||||
RT
|
||||
==
|
||||
|
||||
@@ -87,6 +126,7 @@ and triangle intersection code from pbrt; see the pbrt source code and/or
|
||||
"Physically Based Rendering" book for more about the basic algorithmic
|
||||
details.
|
||||
|
||||
|
||||
Simple
|
||||
======
|
||||
|
||||
@@ -94,6 +134,7 @@ This is a simple "hello world" type program that shows a ~10 line
|
||||
application program calling out to a ~5 line ispc program to do a simple
|
||||
computation.
|
||||
|
||||
|
||||
Volume
|
||||
======
|
||||
|
||||
|
||||
@@ -1,20 +1,18 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasks_pthreads.cpp
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
|
||||
ifeq ($(ARCH), Darwin)
|
||||
TASK_CXX=../tasks_gcd.cpp
|
||||
TASK_LIB=
|
||||
endif
|
||||
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4,avx --arch=x86-64
|
||||
|
||||
ISPC_OBJS=objs/ao_ispc.o objs/ao_ispc_sse2.o objs/ao_ispc_sse4.o \
|
||||
objs/ao_ispc_avx.o
|
||||
OBJS=objs/ao.o objs/ao_serial.o $(ISPC_OBJS) $(TASK_OBJ)
|
||||
|
||||
default: ao
|
||||
|
||||
@@ -26,8 +24,8 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ ao
|
||||
|
||||
ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
ao: dirs $(OBJS) $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
@@ -37,5 +35,5 @@ objs/%.o: ../%.cpp
|
||||
|
||||
objs/ao.o: objs/ao_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -55,7 +55,6 @@
|
||||
using namespace ispc;
|
||||
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
|
||||
#define NSUBSAMPLES 2
|
||||
|
||||
@@ -105,38 +104,6 @@ savePPM(const char *fname, int w, int h)
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc != 4) {
|
||||
@@ -151,8 +118,6 @@ int main(int argc, char **argv)
|
||||
height = atoi (argv[3]);
|
||||
}
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
// Allocate space for output images
|
||||
img = new unsigned char[width * height * 3];
|
||||
fimg = new float[width * height * 3];
|
||||
|
||||
@@ -323,16 +323,13 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
}
|
||||
|
||||
|
||||
static void task ao_task(uniform int y0, uniform int y1, uniform int width,
|
||||
uniform int height, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
ao_scanlines(y0, y1, width, height, nsubsamples, image);
|
||||
static void task ao_task(uniform int width, uniform int height,
|
||||
uniform int nsubsamples, uniform float image[]) {
|
||||
ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
|
||||
}
|
||||
|
||||
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
uniform int dy = 1;
|
||||
for (uniform int y = 0; y < h; y += dy)
|
||||
launch < ao_task(y, y+dy, w, h, nsubsamples, image) >;
|
||||
launch[h] < ao_task(w, h, nsubsamples, image) >;
|
||||
}
|
||||
|
||||
30
examples/aobench/aobench.vcxproj
Executable file → Normal file
30
examples/aobench/aobench.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -21,23 +21,23 @@
|
||||
<ItemGroup>
|
||||
<ClCompile Include="ao.cpp" />
|
||||
<ClCompile Include="ao_serial.cpp" />
|
||||
<ClCompile Include="../tasks_concrt.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="ao.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
@@ -86,15 +86,19 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ExecutablePath);$(ProjectDir)..\..</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -103,6 +107,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -118,6 +123,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -135,6 +141,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -153,6 +160,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -165,4 +173,4 @@
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
@@ -2,7 +2,7 @@
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -g3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --instrument --arch=x86-64
|
||||
ISPCFLAGS=-O2 --instrument --arch=x86-64 --target=sse2
|
||||
|
||||
default: ao
|
||||
|
||||
|
||||
@@ -32,7 +32,6 @@
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
@@ -56,7 +55,6 @@ using namespace ispc;
|
||||
|
||||
#include "instrument.h"
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
|
||||
#define NSUBSAMPLES 2
|
||||
|
||||
@@ -104,37 +102,6 @@ savePPM(const char *fname, int w, int h)
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
@@ -150,8 +117,6 @@ int main(int argc, char **argv)
|
||||
height = atoi (argv[3]);
|
||||
}
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
// Allocate space for output images
|
||||
img = new unsigned char[width * height * 3];
|
||||
fimg = new float[width * height * 3];
|
||||
|
||||
34
examples/aobench_instrumented/aobench_instrumented.vcxproj
Executable file → Normal file
34
examples/aobench_instrumented/aobench_instrumented.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -25,18 +25,18 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="ao.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --instrument --target=sse2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --instrument --target=sse2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --instrument --target=sse2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --instrument --target=sse2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
@@ -85,15 +85,19 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -101,7 +105,8 @@
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -114,7 +119,8 @@
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -129,7 +135,8 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -146,7 +153,8 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
|
||||
38
examples/deferred/Makefile
Normal file
38
examples/deferred/Makefile
Normal file
@@ -0,0 +1,38 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64 --math-lib=fast
|
||||
|
||||
OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/kernels_ispc_sse2.o \
|
||||
objs/kernels_ispc_sse4.o objs/kernels_ispc_avx.o \
|
||||
objs/dynamic_c.o objs/dynamic_cilk.o
|
||||
|
||||
default: deferred_shading
|
||||
|
||||
.PHONY: dirs clean
|
||||
.PRECIOUS: objs/kernels_ispc.h
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ deferred_shading
|
||||
|
||||
deferred_shading: dirs $(OBJS) $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
209
examples/deferred/common.cpp
Normal file
209
examples/deferred/common.cpp
Normal file
@@ -0,0 +1,209 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
#include "deferred.h"
|
||||
#include "../timing.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Allocates `size` bytes at an address that is a multiple of `alignment`.
//
// `alignment` must be a power of two (the rounding below masks with
// alignment-1) and should be at least sizeof(void*) so that the bookkeeping
// slot written just before the returned block is itself aligned.
//
// Returns NULL when the underlying allocator fails.  The result must be
// released with lAlignedFree(), not with plain free().
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
#elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
#else
    // OS X, and any platform not recognized above: over-allocate with
    // malloc(), round the address up to the requested alignment, and store
    // the pointer malloc() returned in the slot immediately before the
    // aligned block so that it can be recovered when freeing.
    const uintptr_t mask = uintptr_t(alignment) - 1;
    void *mem = malloc(size + mask + sizeof(void *));
    if (mem == NULL)
        return NULL;

    // Skip past the bookkeeping slot, then round up.  An address that is
    // already aligned is left where it is, so the block always ends inside
    // the size + mask + sizeof(void*) bytes that were allocated.
    uintptr_t addr = reinterpret_cast<uintptr_t>(mem) + sizeof(void *);
    addr = (addr + mask) & ~mask;

    void **amem = reinterpret_cast<void **>(addr);
    amem[-1] = mem;
    return amem;
#endif
}
|
||||
|
||||
|
||||
// Releases a block obtained from lAlignedMalloc().  A NULL pointer is
// ignored, mirroring free(NULL).
static void
lAlignedFree(void *ptr) {
    if (ptr == NULL)
        return;

#if defined(ISPC_IS_WINDOWS)
    _aligned_free(ptr);
#elif defined(ISPC_IS_LINUX)
    free(ptr);
#else
    // OS X, and any platform not recognized above: the pointer originally
    // returned by malloc() is expected in the slot immediately before the
    // aligned block; that is the pointer that has to be handed to free().
    free(((void **)ptr)[-1]);
#endif
}
|
||||
|
||||
|
||||
// Builds a planar framebuffer: one ALIGNMENT_BYTES-aligned byte buffer of
// width*height entries for each of the r, g and b channels.
Framebuffer::Framebuffer(int width, int height) {
    nPixels = width * height;

    // Allocate the channels in r, g, b order.
    uint8_t **channels[] = { &r, &g, &b };
    for (int c = 0; c < 3; ++c)
        *channels[c] = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
}
|
||||
|
||||
|
||||
// Returns the three channel buffers to the aligned allocator.
Framebuffer::~Framebuffer() {
    uint8_t *const channels[] = { r, g, b };
    for (int c = 0; c < 3; ++c)
        lAlignedFree(channels[c]);
}
|
||||
|
||||
|
||||
void
|
||||
Framebuffer::clear() {
|
||||
memset(r, 0, nPixels);
|
||||
memset(g, 0, nPixels);
|
||||
memset(b, 0, nPixels);
|
||||
}
|
||||
|
||||
|
||||
// Loads a deferred-shading input file from `path`.
//
// The file is read as an ispc::InputHeader followed by a data chunk of
// header.inputDataChunkSize bytes.  The chunk is stored in an
// ALIGNMENT_BYTES-aligned buffer and every pointer in input->arrays is set
// to the chunk address plus the matching header.inputDataArrayOffsets entry.
//
// Returns a heap-allocated InputData on success.  Returns NULL if the file
// cannot be opened, is shorter than expected, or the chunk cannot be
// allocated; in the latter two cases a message is printed to stderr.  On
// every failure path the file handle and all partially built state are
// released before returning.
InputData *
CreateInputDataFromFile(const char *path) {
    FILE *in = fopen(path, "rb");
    if (!in) return 0;

    InputData *input = new InputData;
    input->chunk = NULL;

    // Load header
    if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) {
        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
        fclose(in);
        delete input;
        return NULL;
    }

    // Load data chunk and update pointers
    input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
                                             ALIGNMENT_BYTES);
    if (input->chunk == NULL) {
        fprintf(stderr, "Unable to allocate input data chunk for file \"%s\"\n",
                path);
        fclose(in);
        delete input;
        return NULL;
    }
    if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
        fclose(in);
        lAlignedFree(input->chunk);
        delete input;
        return NULL;
    }

    // The whole file has been consumed; the handle is no longer needed.
    fclose(in);

    // Each array lives inside the chunk at the byte offset the header
    // records for it.
    input->arrays.zBuffer =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
    input->arrays.normalEncoded_x =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]];
    input->arrays.normalEncoded_y =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]];
    input->arrays.specularAmount =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]];
    input->arrays.specularPower =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]];
    input->arrays.albedo_x =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]];
    input->arrays.albedo_y =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]];
    input->arrays.albedo_z =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]];
    input->arrays.lightPositionView_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]];
    input->arrays.lightPositionView_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]];
    input->arrays.lightPositionView_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]];
    input->arrays.lightAttenuationBegin =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]];
    input->arrays.lightColor_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]];
    input->arrays.lightColor_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]];
    input->arrays.lightColor_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]];
    input->arrays.lightAttenuationEnd =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]];

    return input;
}
|
||||
|
||||
|
||||
// Releases the aligned data chunk owned by `input`.
//
// A NULL `input`, or one whose chunk has already been released, is a no-op,
// so calling this twice on the same object is harmless.  After the call the
// pointers in input->arrays refer to freed memory and must not be used.
//
// NOTE(review): the InputData object itself (allocated with `new` in
// CreateInputDataFromFile) is not deleted here -- confirm whether callers
// are expected to delete it.
void DeleteInputData(InputData *input) {
    if (input == NULL || input->chunk == NULL)
        return;

    lAlignedFree(input->chunk);
    // Drop the stale pointer so a repeated call cannot free it again.
    input->chunk = NULL;
}
|
||||
|
||||
|
||||
void WriteFrame(const char *filename, const InputData *input,
|
||||
const Framebuffer &framebuffer) {
|
||||
// Deswizzle and copy to RGBA output
|
||||
// Doesn't need to be fast... only happens once
|
||||
size_t imageBytes = 3 * input->header.framebufferWidth *
|
||||
input->header.framebufferHeight;
|
||||
uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
|
||||
memset(framebufferAOS, 0, imageBytes);
|
||||
|
||||
for (int i = 0; i < input->header.framebufferWidth *
|
||||
input->header.framebufferHeight; ++i) {
|
||||
framebufferAOS[3 * i + 0] = framebuffer.r[i];
|
||||
framebufferAOS[3 * i + 1] = framebuffer.g[i];
|
||||
framebufferAOS[3 * i + 2] = framebuffer.b[i];
|
||||
}
|
||||
|
||||
// Write out simple PPM file
|
||||
FILE *out = fopen(filename, "wb");
|
||||
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
fwrite(framebufferAOS, imageBytes, 1, out);
|
||||
|
||||
lAlignedFree(framebufferAOS);
|
||||
}
|
||||
BIN
examples/deferred/data/pp1280x720.bin
Normal file
BIN
examples/deferred/data/pp1280x720.bin
Normal file
Binary file not shown.
BIN
examples/deferred/data/pp1920x1200.bin
Normal file
BIN
examples/deferred/data/pp1920x1200.bin
Normal file
Binary file not shown.
108
examples/deferred/deferred.h
Normal file
108
examples/deferred/deferred.h
Normal file
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef DEFERRED_H
|
||||
#define DEFERRED_H
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||
#define MIN_TILE_WIDTH 16
|
||||
#define MIN_TILE_HEIGHT 16
|
||||
#define MAX_LIGHTS 1024
|
||||
|
||||
// Identifies each per-pixel / per-light array stored in the input data
// chunk.  CreateInputDataFromFile() uses an enumerator as the index into
// header.inputDataArrayOffsets to find where that array starts inside the
// chunk.  Values are consecutive starting at 0.
// NOTE(review): declared outside the `#ifndef ISPC` section, so presumably
// also meant to be visible to ispc code -- confirm.
enum InputDataArraysEnum {
|
||||
idaZBuffer = 0,
|
||||
idaNormalEncoded_x,
|
||||
idaNormalEncoded_y,
|
||||
idaSpecularAmount,
|
||||
idaSpecularPower,
|
||||
idaAlbedo_x,
|
||||
idaAlbedo_y,
|
||||
idaAlbedo_z,
|
||||
idaLightPositionView_x,
|
||||
idaLightPositionView_y,
|
||||
idaLightPositionView_z,
|
||||
idaLightAttenuationBegin,
|
||||
idaLightColor_x,
|
||||
idaLightColor_y,
|
||||
idaLightColor_z,
|
||||
idaLightAttenuationEnd,
|
||||
|
||||
// Not an array: equals the number of enumerators listed above.
idaNum
|
||||
};
|
||||
|
||||
#ifndef ISPC
|
||||
|
||||
#include <stdint.h>
|
||||
#include "kernels_ispc.h"
|
||||
|
||||
#define ALIGNMENT_BYTES 64
|
||||
|
||||
#define MAX_LIGHTS 1024
|
||||
|
||||
#define VISUALIZE_LIGHT_COUNT 0
|
||||
|
||||
// Everything loaded from one input file by CreateInputDataFromFile().
struct InputData
|
||||
{
|
||||
// Header read from the start of the file; supplies the chunk size, the
// per-array offsets and the framebuffer dimensions used by the loader and
// by WriteFrame().
ispc::InputHeader header;
|
||||
// Typed pointers to the individual arrays; each one points into `chunk`.
ispc::InputDataArrays arrays;
|
||||
// Raw file payload, allocated ALIGNMENT_BYTES-aligned by the loader and
// released by DeleteInputData().
uint8_t *chunk;
|
||||
};
|
||||
|
||||
|
||||
// Planar 8-bit RGB framebuffer: r, g and b are three separate byte buffers
// that the object allocates on construction and frees on destruction.
struct Framebuffer {
    // Allocates the three channel buffers for a width x height image.
    Framebuffer(int width, int height);
    // Frees the three channel buffers.
    ~Framebuffer();

    // Sets every byte of every channel to zero.
    void clear();

    // Channel buffers, owned by this object.
    uint8_t *r, *g, *b;

private:
    // Number of entries in each channel buffer.
    int nPixels;

    // The object owns raw buffers, so copying would free them twice.  Both
    // copy operations are declared private to forbid it.  The assignment
    // operator has to take a reference: an overload taking a pointer is not
    // a copy-assignment operator and would leave the implicit public one in
    // place.
    Framebuffer(const Framebuffer &);
    Framebuffer &operator=(const Framebuffer &);
};
|
||||
|
||||
|
||||
InputData *CreateInputDataFromFile(const char *path);
|
||||
void DeleteInputData(InputData *input);
|
||||
void WriteFrame(const char *filename, const InputData *input,
|
||||
const Framebuffer &framebuffer);
|
||||
void InitDynamicC(InputData *input);
|
||||
void InitDynamicCilk(InputData *input);
|
||||
void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
|
||||
void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
|
||||
|
||||
#endif // !ISPC
|
||||
|
||||
#endif // DEFERRED_H
|
||||
178
examples/deferred/deferred_shading.vcxproj
Executable file
178
examples/deferred/deferred_shading.vcxproj
Executable file
@@ -0,0 +1,178 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>mandelbrot</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="common.cpp" />
|
||||
<ClCompile Include="dynamic_c.cpp" />
|
||||
<ClCompile Include="dynamic_cilk.cpp" />
|
||||
<ClCompile Include="main.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="kernels.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
870
examples/deferred/dynamic_c.cpp
Normal file
870
examples/deferred/dynamic_c.cpp
Normal file
@@ -0,0 +1,870 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include <algorithm>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif // ISPC_IS_LINUX
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||
#define MIN_TILE_WIDTH 16
|
||||
#define MIN_TILE_HEIGHT 16
|
||||
|
||||
|
||||
#define DYNAMIC_TREE_LEVELS 5
|
||||
// If this is set to 1 then the result will be identical to the static version
|
||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||
|
||||
// Allocate `size` bytes whose address is a multiple of `alignment` bytes.
// The result must be released with lAlignedFree(), never with plain free().
// `alignment` is expected to be a power of two: the Apple path computes the
// misalignment by masking with `alignment - 1`.
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
#endif
#ifdef ISPC_IS_LINUX
    return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
    // Over-allocate so there is room for (a) a hidden pointer to the start
    // of the real allocation, stored immediately before the address handed
    // back to the caller, and (b) up to alignment-1 bytes of padding.
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    if (mem == 0)
        return 0;
    char *amem = ((char*)mem) + sizeof(void*);
    // Padding must lie in [0, alignment-1].  Without the final mask an
    // already-aligned address would be advanced by a full `alignment`,
    // pushing the end of the user region one byte past the allocation.
    uint64_t misalignment = reinterpret_cast<uint64_t>(amem) & (alignment - 1);
    amem = amem + uint32_t((alignment - misalignment) & (alignment - 1));
    // lAlignedFree() reads this slot to recover the original pointer.
    ((void**)amem)[-1] = mem;
    return amem;
#endif
}
|
||||
|
||||
|
||||
// Release memory obtained from lAlignedMalloc().  Passing a null pointer is
// a no-op on every platform.
static void
lAlignedFree(void *ptr) {
#ifdef ISPC_IS_WINDOWS
    _aligned_free(ptr);
#endif
#ifdef ISPC_IS_LINUX
    free(ptr);
#endif
#ifdef ISPC_IS_APPLE
    // The pointer returned by malloc() was stashed just before the aligned
    // address by lAlignedMalloc(); a null pointer has no such slot to read.
    if (ptr != 0)
        free(((void**)ptr)[-1]);
#endif
}
|
||||
|
||||
|
||||
// Scan the pixels of one tile ([tileStartX, tileEndX) x [tileStartY, tileEndY))
// and report the smallest and largest view-space depth found.
//
// Each depth-buffer sample is unprojected with
//     viewZ = cameraProj_43 / (z - cameraProj_33)
// and only samples with cameraNear <= viewZ < cameraFar take part.  When no
// sample qualifies, *minZ is cameraFar and *maxZ is cameraNear.
static void
ComputeZBounds(int tileStartX, int tileEndX,
               int tileStartY, int tileEndY,
               // G-buffer data
               float zBuffer[],
               int gBufferWidth,
               // Camera data
               float cameraProj_33, float cameraProj_43,
               float cameraNear, float cameraFar,
               // Output
               float *minZ, float *maxZ)
{
    float nearest = cameraFar;
    float farthest = cameraNear;

    for (int row = tileStartY; row < tileEndY; ++row) {
        for (int col = tileStartX; col < tileEndX; ++col) {
            const float depthSample = zBuffer[row * gBufferWidth + col];
            const float depth = cameraProj_43 / (depthSample - cameraProj_33);

            // Skip skybox/background or otherwise invalid pixels
            const bool usable = (depth >= cameraNear) && (depth < cameraFar);
            if (!usable)
                continue;

            if (depth < nearest)
                nearest = depth;
            if (farthest < depth)
                farthest = depth;
        }
    }

    *maxZ = farthest;
    *minZ = nearest;
}
|
||||
|
||||
|
||||
static void
|
||||
ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
|
||||
int numTilesX, int numTilesY,
|
||||
// G-buffer data
|
||||
float zBuffer[],
|
||||
int gBufferWidth,
|
||||
// Camera data
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar,
|
||||
// Output
|
||||
float minZArray[],
|
||||
float maxZArray[])
|
||||
{
|
||||
for (int tileX = 0; tileX < numTilesX; ++tileX) {
|
||||
float minZ, maxZ;
|
||||
ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
|
||||
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
||||
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
|
||||
cameraNear, cameraFar, &minZ, &maxZ);
|
||||
minZArray[tileX] = minZ;
|
||||
maxZArray[tileX] = maxZ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class MinMaxZTree
|
||||
{
|
||||
public:
|
||||
// Currently (min) tile dimensions must divide gBuffer dimensions evenly
|
||||
// Levels must be small enough that neither dimension goes below one tile
|
||||
MinMaxZTree(
|
||||
int tileWidth, int tileHeight, int levels,
|
||||
int gBufferWidth, int gBufferHeight)
|
||||
: mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
|
||||
{
|
||||
mNumTilesX = gBufferWidth / mTileWidth;
|
||||
mNumTilesY = gBufferHeight / mTileHeight;
|
||||
|
||||
// Allocate arrays
|
||||
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
int x = NumTilesX(i);
|
||||
int y = NumTilesY(i);
|
||||
assert(x > 0);
|
||||
assert(y > 0);
|
||||
// NOTE: If the following two asserts fire it probably means that
|
||||
// the base tile dimensions do not evenly divide the G-buffer dimensions
|
||||
assert(x * (mTileWidth << i) >= gBufferWidth);
|
||||
assert(y * (mTileHeight << i) >= gBufferHeight);
|
||||
mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
}
|
||||
}
|
||||
|
||||
void Update(float *zBuffer, int gBufferPitchInElements,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar)
|
||||
{
|
||||
for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
|
||||
ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
|
||||
zBuffer, gBufferPitchInElements,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
mMinZArrays[0] + (tileY * mNumTilesX),
|
||||
mMaxZArrays[0] + (tileY * mNumTilesX));
|
||||
}
|
||||
|
||||
// Generate other levels
|
||||
for (int level = 1; level < mLevels; ++level) {
|
||||
int destTilesX = NumTilesX(level);
|
||||
int destTilesY = NumTilesY(level);
|
||||
int srcLevel = level - 1;
|
||||
int srcTilesX = NumTilesX(srcLevel);
|
||||
int srcTilesY = NumTilesY(srcLevel);
|
||||
for (int y = 0; y < destTilesY; ++y) {
|
||||
for (int x = 0; x < destTilesX; ++x) {
|
||||
int srcX = x << 1;
|
||||
int srcY = y << 1;
|
||||
// NOTE: Ugly branches to deal with non-multiple dimensions at some levels
|
||||
// TODO: SSE branchless min/max is probably better...
|
||||
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
if (srcX + 1 < srcTilesX) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
}
|
||||
}
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
}
|
||||
mMinZArrays[level][y * destTilesX + x] = minZ;
|
||||
mMaxZArrays[level][y * destTilesX + x] = maxZ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~MinMaxZTree() {
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
lAlignedFree(mMinZArrays[i]);
|
||||
lAlignedFree(mMaxZArrays[i]);
|
||||
}
|
||||
lAlignedFree(mMinZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
}
|
||||
|
||||
int Levels() const { return mLevels; }
|
||||
|
||||
// These round UP, so beware that the last tile for a given level may not be completely full
|
||||
// TODO: Verify this...
|
||||
int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
|
||||
int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
|
||||
int TileWidth(int level = 0) const { return (mTileWidth << level); }
|
||||
int TileHeight(int level = 0) const { return (mTileHeight << level); }
|
||||
|
||||
float MinZ(int level, int tileX, int tileY) const {
|
||||
return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
float MaxZ(int level, int tileX, int tileY) const {
|
||||
return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
|
||||
private:
|
||||
int mTileWidth;
|
||||
int mTileHeight;
|
||||
int mLevels;
|
||||
int mNumTilesX;
|
||||
int mNumTilesY;
|
||||
|
||||
// One array for each "level" in the tree
|
||||
float **mMinZArrays;
|
||||
float **mMaxZArrays;
|
||||
};
|
||||
|
||||
static MinMaxZTree *gMinMaxZTree = 0;
|
||||
|
||||
// (Re)create the global min/max Z tree sized for the framebuffer described
// by `input`.  Any tree left over from an earlier call is destroyed first so
// that repeated initialization does not leak it.
void InitDynamicC(InputData *input) {
    delete gMinMaxZTree;   // deleting a null pointer is a no-op
    gMinMaxZTree =
        new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
                        input->header.framebufferWidth,
                        input->header.framebufferHeight);
}
|
||||
|
||||
|
||||
/* We're going to split a tile into 4 sub-tiles. This function
|
||||
reclassifies the tile's lights with respect to the sub-tiles. */
|
||||
// Distribute the lights of a tile among its four sub-tiles.
//
// Sub-tiles are ordered 00, 10, 01, 11 (bit 0 selects the X half, bit 1 the
// Y half).  A light is kept for a sub-tile when its attenuation sphere
// overlaps that sub-tile's [minZ, maxZ] depth range and it is not entirely on
// the far side of the X or Y splitting plane through (tileMidX, tileMidY).
//
// The indices kept for sub-tile s are written to
// subtileIndices[s * subtileIndicesPitch ...] and their count to
// subtileNumLights[s].
static void
SplitTileMinMax(
    int tileMidX, int tileMidY,
    // Subtile data (00, 10, 01, 11)
    float subtileMinZ[],
    float subtileMaxZ[],
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int lightIndices[],
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Outputs
    int subtileIndices[],
    int subtileIndicesPitch,
    int subtileNumLights[]
    )
{
    const float halfWidth = 0.5f * (float)gBufferWidth;
    const float halfHeight = 0.5f * (float)gBufferHeight;

    // Splitting planes: entry 0 is the X split, entry 1 the Y split.
    float planeXY[2] = { -(cameraProj_11 * halfWidth),
                          (cameraProj_22 * halfHeight) };
    float planeZ[2] = { tileMidX - halfWidth,
                        tileMidY - halfHeight };
    for (int p = 0; p < 2; ++p) {
        // Normalize
        const float invLength = 1.f / sqrtf(planeXY[p] * planeXY[p] +
                                            planeZ[p] * planeZ[p]);
        planeXY[p] *= invLength;
        planeZ[p] *= invLength;
    }

    int kept[4] = { 0, 0, 0, 0 };

    for (int n = 0; n < numLights; ++n) {
        const int lightIndex = lightIndices[n];

        const float lx = light_positionView_x_array[lightIndex];
        const float ly = light_positionView_y_array[lightIndex];
        const float lz = light_positionView_z_array[lightIndex];
        const float reach = light_attenuationEnd_array[lightIndex];
        const float negReach = -reach;

        // Signed distances from the light to the two splitting planes
        const float dx = lz * planeZ[0] + lx * planeXY[0];
        const float dy = lz * planeZ[1] + ly * planeXY[1];

        // A plane only culls when the light sphere lies wholly on one side.
        const bool xDecides = fabsf(dx) > reach;
        const bool yDecides = fabsf(dy) > reach;
        const bool positiveX = dx > 0.0f;
        const bool positiveY = dy > 0.0f;

        for (int s = 0; s < 4; ++s) {
            // Test lights again against subtile z bounds
            bool inside = (lz - subtileMinZ[s] >= negReach) &&
                          (subtileMaxZ[s] - lz >= negReach);
            // Sub-tiles 00/01 sit on the positive-X side, 10/11 on the other.
            if (xDecides)
                inside = inside && (((s & 1) == 0) == positiveX);
            // Sub-tiles 00/10 sit on the positive-Y side, 01/11 on the other.
            if (yDecides)
                inside = inside && (((s >> 1) == 0) == positiveY);

            if (inside) {
                subtileIndices[s * subtileIndicesPitch + kept[s]] = lightIndex;
                ++kept[s];
            }
        }
    }

    for (int s = 0; s < 4; ++s)
        subtileNumLights[s] = kept[s];
}
|
||||
|
||||
|
||||
// Dot product of the 3-vectors (x, y, z) and (a, b, c).
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    float sum = x*a + y*b;
    sum = sum + z*c;
    return sum;
}
|
||||
|
||||
|
||||
// Scale (x, y, z) to unit length and store the result in (ox, oy, oz).
// The inputs are taken by value, so the outputs may be the same variables
// as the inputs.  A zero-length input is not special-cased.
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    const float lengthSquared = x*x + y*y + z*z;
    const float scale = 1.f / sqrtf(lengthSquared);
    ox = scale * x;
    oy = scale * y;
    oz = scale * z;
}
|
||||
|
||||
|
||||
// Map an 8-bit unsigned-normalized value (0..255) to a float in [0, 1].
static inline float
Unorm8ToFloat32(uint8_t u) {
    const float step = 1.0f / 255.0f;
    return static_cast<float>(u) * step;
}
|
||||
|
||||
|
||||
// Convert a float to an 8-bit unsigned-normalized value by scaling with 255
// and truncating.  The input is not clamped here.
static inline uint8_t
Float32ToUnorm8(float f) {
    const float scaled = f * 255.0f;
    return static_cast<uint8_t>(scaled);
}
|
||||
|
||||
|
||||
// Convert a 16-bit half-precision value to a 32-bit float.
//
// "Fast" means the exponent is simply re-biased: there is no special handling
// in this routine for zero, denormal, infinity or NaN encodings.
static inline float
half_to_float_fast(uint16_t h) {
    uint32_t hs = h & (int32_t)0x8000u;  // Pick off sign bit
    uint32_t he = h & (int32_t)0x7C00u;  // Pick off exponent bits
    uint32_t hm = h & (int32_t)0x03FFu;  // Pick off mantissa bits

    // sign
    uint32_t xs = ((uint32_t) hs) << 16;
    // Exponent: unbias the halfp, then bias the single
    int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
    // Exponent
    uint32_t xe = (uint32_t) (xes << 23);
    // Mantissa
    uint32_t xm = ((uint32_t) hm) << 13;

    uint32_t bits = (xs | xe | xm);

    // Copy the bit pattern instead of dereferencing a reinterpret_cast'ed
    // pointer: reading a uint32_t object through a float lvalue violates the
    // strict-aliasing rule, whereas memcpy is well defined.
    float result;
    memcpy(&result, &bits, sizeof(result));
    return result;
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeTileC(
|
||||
int32_t tileStartX, int32_t tileEndX,
|
||||
int32_t tileStartY, int32_t tileEndY,
|
||||
int32_t gBufferWidth, int32_t gBufferHeight,
|
||||
const ispc::InputDataArrays &inputData,
|
||||
// Camera data
|
||||
float cameraProj_11, float cameraProj_22,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
// Light list
|
||||
int32_t tileLightIndices[],
|
||||
int32_t tileNumLights,
|
||||
// UI
|
||||
bool visualizeLightCount,
|
||||
// Output
|
||||
uint8_t framebuffer_r[],
|
||||
uint8_t framebuffer_g[],
|
||||
uint8_t framebuffer_b[]
|
||||
)
|
||||
{
|
||||
if (tileNumLights == 0 || visualizeLightCount) {
|
||||
uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));
|
||||
for (int32_t y = tileStartY; y < tileEndY; ++y) {
|
||||
for (int32_t x = tileStartX; x < tileEndX; ++x) {
|
||||
int32_t framebufferIndex = (y * gBufferWidth + x);
|
||||
framebuffer_r[framebufferIndex] = c;
|
||||
framebuffer_g[framebufferIndex] = c;
|
||||
framebuffer_b[framebufferIndex] = c;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
float twoOverGBufferWidth = 2.0f / gBufferWidth;
|
||||
float twoOverGBufferHeight = 2.0f / gBufferHeight;
|
||||
|
||||
for (int32_t y = tileStartY; y < tileEndY; ++y) {
|
||||
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||
|
||||
for (int32_t x = tileStartX; x < tileEndX; ++x) {
|
||||
int32_t gBufferOffset = y * gBufferWidth + x;
|
||||
|
||||
// Reconstruct position and (negative) view vector from G-buffer
|
||||
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||
float Vneg_x, Vneg_y, Vneg_z;
|
||||
|
||||
float z = inputData.zBuffer[gBufferOffset];
|
||||
|
||||
// Compute screen/clip-space position
|
||||
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||
float positionScreen_x = (0.5f + (float)(x)) *
|
||||
twoOverGBufferWidth - 1.0f;
|
||||
|
||||
// Unproject depth buffer Z value into view space
|
||||
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
cameraProj_11;
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
cameraProj_22;
|
||||
|
||||
// We actually end up with a vector pointing *at* the
|
||||
// surface (i.e. the negative view vector)
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
|
||||
|
||||
// Reconstruct normal from G-buffer
|
||||
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
|
||||
|
||||
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||
float m = sqrtf(4.0f * f - 1.0f);
|
||||
|
||||
surface_normal_x = m * (4.0f * normal_x - 2.0f);
|
||||
surface_normal_y = m * (4.0f * normal_y - 2.0f);
|
||||
surface_normal_z = 3.0f - 8.0f * f;
|
||||
|
||||
// Load other G-buffer parameters
|
||||
float surface_specularAmount =
|
||||
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
|
||||
float surface_specularPower =
|
||||
half_to_float_fast(inputData.specularPower[gBufferOffset]);
|
||||
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||
|
||||
float lit_x = 0.0f;
|
||||
float lit_y = 0.0f;
|
||||
float lit_z = 0.0f;
|
||||
for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
++tileLightIndex) {
|
||||
int32_t lightIndex = tileLightIndices[tileLightIndex];
|
||||
|
||||
// Gather light data relevant to initial culling
|
||||
float light_positionView_x =
|
||||
inputData.lightPositionView_x[lightIndex];
|
||||
float light_positionView_y =
|
||||
inputData.lightPositionView_y[lightIndex];
|
||||
float light_positionView_z =
|
||||
inputData.lightPositionView_z[lightIndex];
|
||||
float light_attenuationEnd =
|
||||
inputData.lightAttenuationEnd[lightIndex];
|
||||
|
||||
// Compute light vector
|
||||
float L_x = light_positionView_x - surface_positionView_x;
|
||||
float L_y = light_positionView_y - surface_positionView_y;
|
||||
float L_z = light_positionView_z - surface_positionView_z;
|
||||
|
||||
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip at end of attenuation
|
||||
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
|
||||
|
||||
if (distanceToLight2 < light_attenutaionEnd2) {
|
||||
float distanceToLight = sqrtf(distanceToLight2);
|
||||
|
||||
float distanceToLightRcp = 1.f / distanceToLight;
|
||||
L_x *= distanceToLightRcp;
|
||||
L_y *= distanceToLightRcp;
|
||||
L_z *= distanceToLightRcp;
|
||||
|
||||
// Start computing brdf
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip back facing
|
||||
if (NdotL > 0.0f) {
|
||||
float light_attenuationBegin =
|
||||
inputData.lightAttenuationBegin[lightIndex];
|
||||
|
||||
// Light distance attenuation (linstep)
|
||||
float lightRange = (light_attenuationEnd - light_attenuationBegin);
|
||||
float falloffPosition = (light_attenuationEnd - distanceToLight);
|
||||
float attenuation = std::min(falloffPosition / lightRange, 1.0f);
|
||||
|
||||
float H_x = (L_x - Vneg_x);
|
||||
float H_y = (L_y - Vneg_y);
|
||||
float H_z = (L_z - Vneg_z);
|
||||
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, H_x, H_y, H_z);
|
||||
NdotH = std::max(NdotH, 0.0f);
|
||||
|
||||
float specular = powf(NdotH, surface_specularPower);
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
(1.0f / 8.0f);
|
||||
float specularContrib = surface_specularAmount *
|
||||
specularNorm * specular;
|
||||
|
||||
float k = attenuation * NdotL * (1.0f + specularContrib);
|
||||
|
||||
float light_color_x = inputData.lightColor_x[lightIndex];
|
||||
float light_color_y = inputData.lightColor_y[lightIndex];
|
||||
float light_color_z = inputData.lightColor_z[lightIndex];
|
||||
|
||||
float lightContrib_x = surface_albedo_x * light_color_x;
|
||||
float lightContrib_y = surface_albedo_y * light_color_y;
|
||||
float lightContrib_z = surface_albedo_z * light_color_z;
|
||||
|
||||
lit_x += lightContrib_x * k;
|
||||
lit_y += lightContrib_y * k;
|
||||
lit_z += lightContrib_z * k;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Gamma correct
|
||||
float gamma = 1.0 / 2.2f;
|
||||
lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
|
||||
lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
|
||||
lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
|
||||
|
||||
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
|
||||
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
|
||||
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// If we few enough lights or this is the base case (last level), shade
|
||||
// this full tile directly
|
||||
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// Skip entirely offscreen tiles
|
||||
if (endX > startX && endY > startY) {
|
||||
ShadeTileC(startX, endX, startY, endY,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->arrays,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||
// Move down a level in the tree
|
||||
--level;
|
||||
tileX <<= 1;
|
||||
tileY <<= 1;
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
|
||||
// Work out splitting coords
|
||||
int midX = (tileX + 1) * width;
|
||||
int midY = (tileY + 1) * height;
|
||||
|
||||
// Read subtile min/max data
|
||||
// NOTE: We must be sure to handle out-of-bounds access here since
|
||||
// sometimes we'll only have 1 or 2 subtiles for non-pow-2
|
||||
// framebuffer sizes.
|
||||
bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
|
||||
bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
|
||||
|
||||
// NOTE: Order is 00, 10, 01, 11
|
||||
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
input->header.cameraFar, input->header.cameraFar};
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
input->header.cameraNear, input->header.cameraNear};
|
||||
|
||||
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
if (rightTileExists) {
|
||||
minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
|
||||
maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
|
||||
if (bottomTileExists) {
|
||||
minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
|
||||
maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
|
||||
}
|
||||
}
|
||||
if (bottomTileExists) {
|
||||
minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
|
||||
maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
|
||||
}
|
||||
|
||||
// Cull lights into subtile lists
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int subtileLightIndices[4][MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int subtileNumLights[4];
|
||||
SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd,
|
||||
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||
|
||||
// Recurse into subtiles
|
||||
ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
subtileLightIndices[0], subtileNumLights[0],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||
subtileLightIndices[1], subtileNumLights[1],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX , tileY + 1,
|
||||
subtileLightIndices[2], subtileNumLights[2],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
|
||||
subtileLightIndices[3], subtileNumLights[3],
|
||||
framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Cull lights 0..numLights-1 against the frustum of one screen tile.
//
// A light survives when its attenuation sphere overlaps the tile's
// [minZ, maxZ] depth range and is not entirely outside any of the four side
// planes built from the tile's pixel rectangle.  Surviving light indices are
// packed into tileLightIndices in ascending order; the return value is how
// many were written.
static int
IntersectLightsWithTileMinMax(
    int tileStartX, int tileEndX,
    int tileStartY, int tileEndY,
    // Tile data
    float minZ,
    float maxZ,
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    int tileLightIndices[]
    )
{
    const float halfWidth = 0.5f * (float)gBufferWidth;
    const float halfHeight = 0.5f * (float)gBufferHeight;

    // Side planes 0 and 1 are tested against the light's X coordinate,
    // planes 2 and 3 against its Y coordinate.
    float planeXY[4] = { -(cameraProj_11 * halfWidth),
                          (cameraProj_11 * halfWidth),
                          (cameraProj_22 * halfHeight),
                         -(cameraProj_22 * halfHeight) };
    float planeZ[4] = { tileEndX - halfWidth,
                        -tileStartX + halfWidth,
                        tileEndY - halfHeight,
                        -tileStartY + halfHeight };

    for (int p = 0; p < 4; ++p) {
        const float invLength = 1.f / sqrtf(planeXY[p] * planeXY[p] +
                                            planeZ[p] * planeZ[p]);
        planeXY[p] *= invLength;
        planeZ[p] *= invLength;
    }

    int kept = 0;

    for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) {
        const float lz = light_positionView_z_array[lightIndex];
        const float negReach = -light_attenuationEnd_array[lightIndex];

        // Depth range first; it needs only the light's Z coordinate.
        const bool withinDepth = (lz - minZ >= negReach) &&
                                 (maxZ - lz >= negReach);
        if (!withinDepth)
            continue;

        const float lx = light_positionView_x_array[lightIndex];
        const float ly = light_positionView_y_array[lightIndex];
        const float lateral[4] = { lx, lx, ly, ly };

        bool inside = true;
        for (int p = 0; p < 4 && inside; ++p) {
            const float d = lz * planeZ[p] + lateral[p] * planeXY[p];
            inside = (d >= negReach);
        }

        // Pack and store intersecting lights
        if (inside) {
            tileLightIndices[kept] = lightIndex;
            ++kept;
        }
    }

    return kept;
}
|
||||
|
||||
|
||||
void
|
||||
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// Get Z min/max for this tile
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
float minZ = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// This is a root tile, so first do a full 6-plane cull
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int lightIndices[MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int numLights = IntersectLightsWithTileMinMax(
|
||||
startX, endX, startY, endY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd, lightIndices);
|
||||
|
||||
// Now kick off the recursive process for this tile
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
numLights, framebuffer);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
|
||||
{
|
||||
MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// Update min/max Z tree
|
||||
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraNear, input->header.cameraFar);
|
||||
|
||||
int rootLevel = minMaxZTree->Levels() - 1;
|
||||
int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
|
||||
int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
|
||||
int rootTiles = rootTilesX * rootTilesY;
|
||||
for (int g = 0; g < rootTiles; ++g) {
|
||||
uint32_t tileY = g / rootTilesX;
|
||||
uint32_t tileX = g % rootTilesX;
|
||||
ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
|
||||
}
|
||||
}
|
||||
398
examples/deferred/dynamic_cilk.cpp
Normal file
398
examples/deferred/dynamic_cilk.cpp
Normal file
@@ -0,0 +1,398 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef __cilk
|
||||
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif // ISPC_IS_LINUX
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||
#define MIN_TILE_WIDTH 16
|
||||
#define MIN_TILE_HEIGHT 16
|
||||
|
||||
|
||||
#define DYNAMIC_TREE_LEVELS 5
|
||||
// If this is set to 1 then the result will be identical to the static version
|
||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||
|
||||
// Allocates `size` bytes at an address that is a multiple of `alignment`.
//
// `alignment` must be a power of two (the fallback path masks with
// alignment - 1).  Returns a null pointer if the underlying allocation
// fails.  Memory obtained here must be released with lAlignedFree().
//
// Windows and Linux use the platform's aligned allocator.  Every other
// platform (including Apple) uses a portable fallback that over-allocates
// with malloc() and stashes the original malloc() pointer in the slot just
// below the aligned address that is handed out.
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
#elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
#else
    // Worst case we skip sizeof(void*) bytes for the back-pointer plus
    // (alignment - 1) bytes of padding before the aligned address.
    void *mem = malloc(size + (alignment - 1) + sizeof(void *));
    if (mem == 0)
        return 0;

    char *amem = ((char *)mem) + sizeof(void *);
    // Only advance when the address is not already aligned.  Advancing by a
    // full `alignment` in that case would leave one byte fewer than `size`
    // inside the allocation.
    uint64_t misalign = reinterpret_cast<uint64_t>(amem) &
                        (uint64_t)(alignment - 1);
    if (misalign != 0)
        amem += (uint64_t)alignment - misalign;

    ((void **)amem)[-1] = mem;
    return amem;
#endif
}


// Releases memory returned by lAlignedMalloc().  Passing a null pointer is
// a no-op on every platform.
static void
lAlignedFree(void *ptr) {
#if defined(ISPC_IS_WINDOWS)
    _aligned_free(ptr);
#elif defined(ISPC_IS_LINUX)
    free(ptr);
#else
    // The fallback allocator stored the real malloc() pointer immediately
    // below the aligned block.
    if (ptr != 0)
        free(((void **)ptr)[-1]);
#endif
}
|
||||
|
||||
|
||||
class MinMaxZTreeCilk
|
||||
{
|
||||
public:
|
||||
// Currently (min) tile dimensions must divide gBuffer dimensions evenly
|
||||
// Levels must be small enough that neither dimension goes below one tile
|
||||
MinMaxZTreeCilk(
|
||||
int tileWidth, int tileHeight, int levels,
|
||||
int gBufferWidth, int gBufferHeight)
|
||||
: mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
|
||||
{
|
||||
mNumTilesX = gBufferWidth / mTileWidth;
|
||||
mNumTilesY = gBufferHeight / mTileHeight;
|
||||
|
||||
// Allocate arrays
|
||||
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
int x = NumTilesX(i);
|
||||
int y = NumTilesY(i);
|
||||
assert(x > 0);
|
||||
assert(y > 0);
|
||||
// NOTE: If the following two asserts fire it probably means that
|
||||
// the base tile dimensions do not evenly divide the G-buffer dimensions
|
||||
assert(x * (mTileWidth << i) >= gBufferWidth);
|
||||
assert(y * (mTileHeight << i) >= gBufferHeight);
|
||||
mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
}
|
||||
}
|
||||
|
||||
void Update(float *zBuffer, int gBufferPitchInElements,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar)
|
||||
{
|
||||
// Compute level 0 in parallel. Outer loops is here since we use Cilk
|
||||
_Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
|
||||
ispc::ComputeZBoundsRow(tileY,
|
||||
mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
|
||||
zBuffer, gBufferPitchInElements,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
mMinZArrays[0] + (tileY * mNumTilesX),
|
||||
mMaxZArrays[0] + (tileY * mNumTilesX));
|
||||
}
|
||||
|
||||
// Generate other levels
|
||||
// NOTE: We currently don't use ispc here since it's sort of an
|
||||
// awkward gather-based reduction Using SSE odd pack/unpack
|
||||
// instructions might actually work here when we need to optimize
|
||||
for (int level = 1; level < mLevels; ++level) {
|
||||
int destTilesX = NumTilesX(level);
|
||||
int destTilesY = NumTilesY(level);
|
||||
int srcLevel = level - 1;
|
||||
int srcTilesX = NumTilesX(srcLevel);
|
||||
int srcTilesY = NumTilesY(srcLevel);
|
||||
_Cilk_for (int y = 0; y < destTilesY; ++y) {
|
||||
for (int x = 0; x < destTilesX; ++x) {
|
||||
int srcX = x << 1;
|
||||
int srcY = y << 1;
|
||||
// NOTE: Ugly branches to deal with non-multiple dimensions at some levels
|
||||
// TODO: SSE branchless min/max is probably better...
|
||||
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
if (srcX + 1 < srcTilesX) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
}
|
||||
}
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
}
|
||||
mMinZArrays[level][y * destTilesX + x] = minZ;
|
||||
mMaxZArrays[level][y * destTilesX + x] = maxZ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~MinMaxZTreeCilk() {
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
lAlignedFree(mMinZArrays[i]);
|
||||
lAlignedFree(mMaxZArrays[i]);
|
||||
}
|
||||
lAlignedFree(mMinZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
}
|
||||
|
||||
int Levels() const { return mLevels; }
|
||||
|
||||
// These round UP, so beware that the last tile for a given level may not be completely full
|
||||
// TODO: Verify this...
|
||||
int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
|
||||
int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
|
||||
int TileWidth(int level = 0) const { return (mTileWidth << level); }
|
||||
int TileHeight(int level = 0) const { return (mTileHeight << level); }
|
||||
|
||||
float MinZ(int level, int tileX, int tileY) const {
|
||||
return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
float MaxZ(int level, int tileX, int tileY) const {
|
||||
return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
|
||||
private:
|
||||
int mTileWidth;
|
||||
int mTileHeight;
|
||||
int mLevels;
|
||||
int mNumTilesX;
|
||||
int mNumTilesY;
|
||||
|
||||
// One array for each "level" in the tree
|
||||
float **mMinZArrays;
|
||||
float **mMaxZArrays;
|
||||
};
|
||||
|
||||
static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;
|
||||
|
||||
void InitDynamicCilk(InputData *input) {
|
||||
gMinMaxZTreeCilk =
|
||||
new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
|
||||
input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// If we few enough lights or this is the base case (last level), shade
|
||||
// this full tile directly
|
||||
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// Skip entirely offscreen tiles
|
||||
if (endX > startX && endY > startY) {
|
||||
ispc::ShadeTile(
|
||||
startX, endX, startY, endY,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
&input->arrays,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||
// Move down a level in the tree
|
||||
--level;
|
||||
tileX <<= 1;
|
||||
tileY <<= 1;
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
|
||||
// Work out splitting coords
|
||||
int midX = (tileX + 1) * width;
|
||||
int midY = (tileY + 1) * height;
|
||||
|
||||
// Read subtile min/max data
|
||||
// NOTE: We must be sure to handle out-of-bounds access here since
|
||||
// sometimes we'll only have 1 or 2 subtiles for non-pow-2
|
||||
// framebuffer sizes.
|
||||
bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
|
||||
bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
|
||||
|
||||
// NOTE: Order is 00, 10, 01, 11
|
||||
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
input->header.cameraFar, input->header.cameraFar};
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
input->header.cameraNear, input->header.cameraNear};
|
||||
|
||||
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
if (rightTileExists) {
|
||||
minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
|
||||
maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
|
||||
if (bottomTileExists) {
|
||||
minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
|
||||
maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
|
||||
}
|
||||
}
|
||||
if (bottomTileExists) {
|
||||
minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
|
||||
maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
|
||||
}
|
||||
|
||||
// Cull lights into subtile lists
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int subtileLightIndices[4][MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int subtileNumLights[4];
|
||||
ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd,
|
||||
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||
|
||||
// Recurse into subtiles
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
subtileLightIndices[0], subtileNumLights[0],
|
||||
framebuffer);
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||
subtileLightIndices[1], subtileNumLights[1],
|
||||
framebuffer);
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY + 1,
|
||||
subtileLightIndices[2], subtileNumLights[2],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
|
||||
subtileLightIndices[3], subtileNumLights[3],
|
||||
framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// Get Z min/max for this tile
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
float minZ = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// This is a root tile, so first do a full 6-plane cull
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int lightIndices[MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int numLights = ispc::IntersectLightsWithTileMinMax(
|
||||
startX, endX, startY, endY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd, lightIndices);
|
||||
|
||||
// Now kick off the recursive process for this tile
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
numLights, framebuffer);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
|
||||
{
|
||||
MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// Update min/max Z tree
|
||||
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraNear, input->header.cameraFar);
|
||||
|
||||
// Launch the "root" tiles. Ideally these should at least fill the
|
||||
// machine... at the moment we have a static number of "levels" to the
|
||||
// mip tree but it might make sense to compute it based on the width of
|
||||
// the machine.
|
||||
int rootLevel = minMaxZTree->Levels() - 1;
|
||||
int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
|
||||
int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
|
||||
int rootTiles = rootTilesX * rootTilesY;
|
||||
_Cilk_for (int g = 0; g < rootTiles; ++g) {
|
||||
uint32_t tileY = g / rootTilesX;
|
||||
uint32_t tileX = g % rootTilesX;
|
||||
ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // __cilk
|
||||
717
examples/deferred/kernels.ispc
Normal file
717
examples/deferred/kernels.ispc
Normal file
@@ -0,0 +1,717 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "deferred.h"
|
||||
|
||||
// Structure-of-arrays bundle of the G-buffer channels and the light list
// that the shading kernels read.  ShadeTile indexes the per-pixel arrays
// with (y * gBufferWidth + x) and the light arrays with a light index.
// NOTE(review): the C++ side passes &input->arrays for this struct, so the
// field order presumably has to match the host declaration - confirm before
// reordering anything.
struct InputDataArrays
|
||||
{
|
||||
// Per-pixel depth-buffer value; unprojected to view space by the kernels.
uniform float zBuffer[];
|
||||
// The "half float" channels hold 16-bit values that ShadeTile decodes with
// half_to_float_fast().
uniform unsigned int16 normalEncoded_x[]; // half float
|
||||
uniform unsigned int16 normalEncoded_y[]; // half float
|
||||
uniform unsigned int16 specularAmount[]; // half float
|
||||
uniform unsigned int16 specularPower[]; // half float
|
||||
// The "unorm8" channels are decoded to float with Unorm8ToFloat32().
uniform unsigned int8 albedo_x[]; // unorm8
|
||||
uniform unsigned int8 albedo_y[]; // unorm8
|
||||
uniform unsigned int8 albedo_z[]; // unorm8
|
||||
// Per-light data, indexed by light index.  The culling routines read the
// view-space position and lightAttenuationEnd.
uniform float lightPositionView_x[];
|
||||
uniform float lightPositionView_y[];
|
||||
uniform float lightPositionView_z[];
|
||||
uniform float lightAttenuationBegin[];
|
||||
uniform float lightColor_x[];
|
||||
uniform float lightColor_y[];
|
||||
uniform float lightColor_z[];
|
||||
uniform float lightAttenuationEnd[];
|
||||
};
|
||||
|
||||
// Per-frame camera and framebuffer parameters that accompany InputDataArrays.
struct InputHeader
|
||||
{
|
||||
// Projection matrix.  The host code passes elements [0][0], [1][1], [2][2]
// and [3][2] to the kernels as cameraProj_11, _22, _33 and _43.
uniform float cameraProj[4][4];
|
||||
// Near/far view-space depths; ComputeZBounds only counts depths in
// [cameraNear, cameraFar).
uniform float cameraNear;
|
||||
uniform float cameraFar;
|
||||
|
||||
// Framebuffer size in pixels; also used as the row pitch of the G-buffer
// arrays by the host dispatch code.
uniform int32 framebufferWidth;
|
||||
uniform int32 framebufferHeight;
|
||||
uniform int32 numLights;
|
||||
uniform int32 inputDataChunkSize;
|
||||
// NOTE(review): idaNum is not defined in this file; presumably it comes
// from deferred.h and counts the arrays in InputDataArrays - confirm.
uniform int32 inputDataArrayOffsets[idaNum];
|
||||
};
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Common utility routines
|
||||
|
||||
// Dot product of the 3-vectors (x, y, z) and (a, b, c).
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    float sum = x * a + y * b;
    sum = sum + z * c;
    return sum;
}
|
||||
|
||||
|
||||
// Scales (x, y, z) by the reciprocal square root of its squared length and
// writes the result to (ox, oy, oz).  A zero-length input is not treated
// specially.
static inline void
normalize3(float x, float y, float z, reference float ox,
           reference float oy, reference float oz) {
    float invLength = rsqrt(x*x + y*y + z*z);
    ox = x * invLength;
    oy = y * invLength;
    oz = z * invLength;
}
|
||||
|
||||
|
||||
// Converts an 8-bit unsigned-normalized value to float: 0 maps to 0.0 and
// 255 maps to 255 * (1/255).
static inline float
Unorm8ToFloat32(unsigned int8 u) {
    float asFloat = (float)u;
    return asFloat * (1.0f / 255.0f);
}
|
||||
|
||||
|
||||
// Converts a float to an 8-bit unsigned-normalized value by scaling with 255
// and casting.  The input is not clamped or rounded here.
static inline unsigned int8
Float32ToUnorm8(float f) {
    float scaled = f * 255.0f;
    return (unsigned int8)scaled;
}
|
||||
|
||||
|
||||
// Finds the smallest and largest view-space depth among the pixels of the
// tile [tileStartX, tileEndX) x [tileStartY, tileEndY).
//
// The tile width must be a multiple of programCount (SIMD size), since each
// row is read in chunks of programCount pixels.  Depths outside
// [cameraNear, cameraFar) are ignored; if no pixel qualifies the outputs are
// minZ = cameraFar and maxZ = cameraNear.
static void
ComputeZBounds(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    reference uniform float minZ,
    reference uniform float maxZ
    )
{
    // Each program instance tracks bounds for the pixels it visits; the
    // per-instance results are combined at the end.
    float nearestSoFar = cameraFar;
    float farthestSoFar = cameraNear;

    for (uniform int32 row = tileStartY; row < tileEndY; ++row) {
        for (uniform int32 col = tileStartX; col < tileEndX; col += programCount) {
            // Unproject the depth buffer value into view space
            float depth = zBuffer[(row * gBufferWidth + col) + programIndex];
            float viewZ = cameraProj_43 / (depth - cameraProj_33);

            // Skip skybox/background or otherwise invalid pixels
            if ((viewZ < cameraFar) && (viewZ >= cameraNear)) {
                nearestSoFar = min(nearestSoFar, viewZ);
                farthestSoFar = max(farthestSoFar, viewZ);
            }
        }
    }

    minZ = reduce_min(nearestSoFar);
    maxZ = reduce_max(farthestSoFar);
}
|
||||
|
||||
|
||||
// Collects the indices of the lights that may affect a screen tile whose
// view-space depth range is [minZ, maxZ].
//
// A light is kept when its signed distance to each of six planes (the two
// depth bounds and the four side planes through the tile edges) is at least
// -light_attenuationEnd.  Kept indices are packed into tileLightIndices and
// their count is returned.
//
// The tile width must be a multiple of programCount (SIMD size), and
// numLights must currently be a multiple of programCount as well, because
// the light arrays are read in chunks of programCount entries.
export uniform int32
IntersectLightsWithTileMinMax(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // Tile data
    uniform float minZ,
    uniform float maxZ,
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    reference uniform int32 tileLightIndices[]
    )
{
    uniform float halfWidth = 0.5f * (float)gBufferWidth;
    uniform float halfHeight = 0.5f * (float)gBufferHeight;

    // Parallelize across frustum planes.
    // There are really only four side planes, but the arrays are sized so
    // that programCount > 4 is handled robustly.
    uniform float sidePlane_xy[programCount];
    uniform float sidePlane_z[programCount];

    // TODO: If programIndex < 4 here? Don't care about masking off the
    // rest but if interleaving ("x2" modes) the other lanes should ideally
    // not be emitted...
    {
        // This one is totally constant over the whole screen... worth
        // pulling it up at all?
        float xyLanes;
        xyLanes = insert(xyLanes, 0, -(cameraProj_11 * halfWidth));
        xyLanes = insert(xyLanes, 1,  (cameraProj_11 * halfWidth));
        xyLanes = insert(xyLanes, 2,  (cameraProj_22 * halfHeight));
        xyLanes = insert(xyLanes, 3, -(cameraProj_22 * halfHeight));

        float zLanes;
        zLanes = insert(zLanes, 0,  tileEndX - halfWidth);
        zLanes = insert(zLanes, 1, -tileStartX + halfWidth);
        zLanes = insert(zLanes, 2,  tileEndY - halfHeight);
        zLanes = insert(zLanes, 3, -tileStartY + halfHeight);

        // Normalize each plane and save it out for uniform use below
        float invLength = rsqrt(xyLanes * xyLanes + zLanes * zLanes);
        sidePlane_xy[programIndex] = xyLanes * invLength;
        sidePlane_z[programIndex] = zLanes * invLength;
    }

    uniform int32 keptCount = 0;

    for (uniform int32 chunkStart = 0; chunkStart < numLights;
         chunkStart += programCount) {
        int32 lightIndex = chunkStart + programIndex;
        float lightZ = light_positionView_z_array[lightIndex];
        float negReach = -light_attenuationEnd_array[lightIndex];

        // Depth-range planes first
        bool inside = ((lightZ - minZ) >= negReach);
        inside = inside && ((maxZ - lightZ) >= negReach);

        // Greedy early-out when no light in this chunk survives the depth
        // test.  This seems better than cif (!inside) ccontinue; since the
        // rest of the loop body does not actually need to be masked.
        if (!any(inside))
            continue;

        float lightX = light_positionView_x_array[lightIndex];
        float lightY = light_positionView_y_array[lightIndex];

        // Planes 0 and 1 pair Z with the light's X coordinate
        float dist = lightZ * sidePlane_z[0] + lightX * sidePlane_xy[0];
        inside = inside && (dist >= negReach);

        dist = lightZ * sidePlane_z[1] + lightX * sidePlane_xy[1];
        inside = inside && (dist >= negReach);

        // Planes 2 and 3 pair Z with the light's Y coordinate
        dist = lightZ * sidePlane_z[2] + lightY * sidePlane_xy[2];
        inside = inside && (dist >= negReach);

        dist = lightZ * sidePlane_z[3] + lightY * sidePlane_xy[3];
        inside = inside && (dist >= negReach);

        // Pack and store intersecting lights
        cif (inside) {
            keptCount += packed_store_active(tileLightIndices, keptCount,
                                             lightIndex);
        }
    }

    return keptCount;
}
|
||||
|
||||
|
||||
// Computes the tile's view-space depth range with ComputeZBounds and then
// culls lights against the tile with IntersectLightsWithTileMinMax,
// returning the number of light indices written to tileLightIndices.
//
// The tile width must be a multiple of programCount (SIMD size).
// NOTE(review): the numLights parameter is not used; MAX_LIGHTS is forwarded
// to the cull instead, so the light arrays must hold MAX_LIGHTS entries.
static uniform int32
IntersectLightsWithTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // G-buffer data
    uniform float zBuffer[],
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    reference uniform int32 tileLightIndices[]
    )
{
    // Depth range of the pixels in this tile
    uniform float tileNearZ, tileFarZ;
    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
                   zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
                   cameraNear, cameraFar, tileNearZ, tileFarZ);

    // Cull against that depth range and the tile's side planes
    return IntersectLightsWithTileMinMax(
        tileStartX, tileEndX, tileStartY, tileEndY, tileNearZ, tileFarZ,
        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
        MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
        light_positionView_z_array, light_attenuationEnd_array,
        tileLightIndices);
}
|
||||
|
||||
|
||||
// tile width must be a multiple of programCount (SIMD size)
|
||||
export void
|
||||
ShadeTile(
|
||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||
uniform int32 tileStartY, uniform int32 tileEndY,
|
||||
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
|
||||
reference uniform InputDataArrays inputData,
|
||||
// Camera data
|
||||
uniform float cameraProj_11, uniform float cameraProj_22,
|
||||
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||
// Light list
|
||||
reference uniform int32 tileLightIndices[],
|
||||
uniform int32 tileNumLights,
|
||||
// UI
|
||||
uniform bool visualizeLightCount,
|
||||
// Output
|
||||
reference uniform unsigned int8 framebuffer_r[],
|
||||
reference uniform unsigned int8 framebuffer_g[],
|
||||
reference uniform unsigned int8 framebuffer_b[]
|
||||
)
|
||||
{
|
||||
if (tileNumLights == 0 || visualizeLightCount) {
|
||||
uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
|
||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
|
||||
int32 framebufferIndex = (y * gBufferWidth + x) + programIndex;
|
||||
framebuffer_r[framebufferIndex] = c;
|
||||
framebuffer_g[framebufferIndex] = c;
|
||||
framebuffer_b[framebufferIndex] = c;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
|
||||
uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
|
||||
|
||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||
uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||
|
||||
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
|
||||
uniform int32 gBufferOffsetBase = y * gBufferWidth + x;
|
||||
int32 gBufferOffset = gBufferOffsetBase + programIndex;
|
||||
|
||||
// Reconstruct position and (negative) view vector from G-buffer
|
||||
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||
float Vneg_x, Vneg_y, Vneg_z;
|
||||
|
||||
float z = inputData.zBuffer[gBufferOffset];
|
||||
|
||||
// Compute screen/clip-space position
|
||||
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||
float positionScreen_x = (0.5f + (float)(x + programIndex)) *
|
||||
twoOverGBufferWidth - 1.0f;
|
||||
|
||||
// Unproject depth buffer Z value into view space
|
||||
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
cameraProj_11;
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
cameraProj_22;
|
||||
|
||||
// We actually end up with a vector pointing *at* the
|
||||
// surface (i.e. the negative view vector)
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
|
||||
|
||||
// Reconstruct normal from G-buffer
|
||||
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
|
||||
|
||||
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||
float m = sqrt(4.0f * f - 1.0f);
|
||||
|
||||
surface_normal_x = m * (4.0f * normal_x - 2.0f);
|
||||
surface_normal_y = m * (4.0f * normal_y - 2.0f);
|
||||
surface_normal_z = 3.0f - 8.0f * f;
|
||||
|
||||
// Load other G-buffer parameters
|
||||
float surface_specularAmount =
|
||||
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
|
||||
float surface_specularPower =
|
||||
half_to_float_fast(inputData.specularPower[gBufferOffset]);
|
||||
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||
|
||||
float lit_x = 0.0f;
|
||||
float lit_y = 0.0f;
|
||||
float lit_z = 0.0f;
|
||||
for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
++tileLightIndex) {
|
||||
uniform int32 lightIndex = tileLightIndices[tileLightIndex];
|
||||
|
||||
// Gather light data relevant to initial culling
|
||||
uniform float light_positionView_x =
|
||||
inputData.lightPositionView_x[lightIndex];
|
||||
uniform float light_positionView_y =
|
||||
inputData.lightPositionView_y[lightIndex];
|
||||
uniform float light_positionView_z =
|
||||
inputData.lightPositionView_z[lightIndex];
|
||||
uniform float light_attenuationEnd =
|
||||
inputData.lightAttenuationEnd[lightIndex];
|
||||
|
||||
// Compute light vector
|
||||
float L_x = light_positionView_x - surface_positionView_x;
|
||||
float L_y = light_positionView_y - surface_positionView_y;
|
||||
float L_z = light_positionView_z - surface_positionView_z;
|
||||
|
||||
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip at end of attenuation
|
||||
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
|
||||
|
||||
cif (distanceToLight2 < light_attenutaionEnd2) {
|
||||
float distanceToLight = sqrt(distanceToLight2);
|
||||
|
||||
// HLSL "rcp" is allowed to be fairly inaccurate
|
||||
float distanceToLightRcp = rcp(distanceToLight);
|
||||
L_x *= distanceToLightRcp;
|
||||
L_y *= distanceToLightRcp;
|
||||
L_z *= distanceToLightRcp;
|
||||
|
||||
// Start computing brdf
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip back facing
|
||||
cif (NdotL > 0.0f) {
|
||||
uniform float light_attenuationBegin =
|
||||
inputData.lightAttenuationBegin[lightIndex];
|
||||
|
||||
// Light distance attenuation (linstep)
|
||||
float lightRange = (light_attenuationEnd - light_attenuationBegin);
|
||||
float falloffPosition = (light_attenuationEnd - distanceToLight);
|
||||
float attenuation = min(falloffPosition / lightRange, 1.0f);
|
||||
|
||||
float H_x = (L_x - Vneg_x);
|
||||
float H_y = (L_y - Vneg_y);
|
||||
float H_z = (L_z - Vneg_z);
|
||||
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, H_x, H_y, H_z);
|
||||
NdotH = max(NdotH, 0.0f);
|
||||
|
||||
float specular = pow(NdotH, surface_specularPower);
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
(1.0f / 8.0f);
|
||||
float specularContrib = surface_specularAmount *
|
||||
specularNorm * specular;
|
||||
|
||||
float k = attenuation * NdotL * (1.0f + specularContrib);
|
||||
|
||||
uniform float light_color_x = inputData.lightColor_x[lightIndex];
|
||||
uniform float light_color_y = inputData.lightColor_y[lightIndex];
|
||||
uniform float light_color_z = inputData.lightColor_z[lightIndex];
|
||||
|
||||
float lightContrib_x = surface_albedo_x * light_color_x;
|
||||
float lightContrib_y = surface_albedo_y * light_color_y;
|
||||
float lightContrib_z = surface_albedo_z * light_color_z;
|
||||
|
||||
lit_x += lightContrib_x * k;
|
||||
lit_y += lightContrib_y * k;
|
||||
lit_z += lightContrib_z * k;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Gamma correct
|
||||
// These pows are pretty slow right now, but we can do
|
||||
// something faster if really necessary to squeeze every
|
||||
// last bit of performance out of it
|
||||
float gamma = 1.0 / 2.2f;
|
||||
lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
|
||||
lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
|
||||
lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
|
||||
|
||||
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
|
||||
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
|
||||
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Static decomposition
|
||||
|
||||
// Task body for the static decomposition path: renders one
// MIN_TILE_WIDTH x MIN_TILE_HEIGHT tile of the framebuffer.
//
// The tile is chosen from taskIndex, interpreted as a row-major index into
// a grid that is num_groups_x tiles wide (num_groups_y is not read here).
// The lights touching the tile are collected with IntersectLightsWithTile()
// and the tile is then shaded with ShadeTile(), which writes the
// framebuffer_r/g/b output arrays.
task void
|
||||
RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
||||
reference uniform InputHeader inputHeader,
|
||||
reference uniform InputDataArrays inputData,
|
||||
uniform int visualizeLightCount,
|
||||
// Output
|
||||
reference uniform unsigned int8 framebuffer_r[],
|
||||
reference uniform unsigned int8 framebuffer_g[],
|
||||
reference uniform unsigned int8 framebuffer_b[]) {
|
||||
// Decompose the flat task index into 2D tile coordinates.
uniform int32 group_y = taskIndex / num_groups_x;
|
||||
uniform int32 group_x = taskIndex % num_groups_x;
|
||||
uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
|
||||
uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
|
||||
// NOTE(review): the tile end is not clamped to the framebuffer size;
// presumably the framebuffer dimensions are multiples of the tile size
// -- confirm against the inputs.
uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
|
||||
uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
|
||||
|
||||
uniform int framebufferWidth = inputHeader.framebufferWidth;
|
||||
uniform int framebufferHeight = inputHeader.framebufferHeight;
|
||||
// Only these four projection-matrix entries are handed to the helpers
// below. NOTE(review): ShadeTile's body names its camera parameters
// cameraProj_11/22/33/43; presumably that is 1-based naming for the same
// entries -- confirm.
uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
|
||||
uniform float cameraProj_11 = inputHeader.cameraProj[1][1];
|
||||
uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
|
||||
uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
|
||||
|
||||
// Light intersection: figure out which lights illuminate this tile.
|
||||
uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
|
||||
uniform int numTileLights =
|
||||
IntersectLightsWithTile(tile_start_x, tile_end_x,
|
||||
tile_start_y, tile_end_y,
|
||||
framebufferWidth, framebufferHeight,
|
||||
inputData.zBuffer,
|
||||
cameraProj_00, cameraProj_11,
|
||||
cameraProj_22, cameraProj_32,
|
||||
inputHeader.cameraNear, inputHeader.cameraFar,
|
||||
MAX_LIGHTS,
|
||||
inputData.lightPositionView_x,
|
||||
inputData.lightPositionView_y,
|
||||
inputData.lightPositionView_z,
|
||||
inputData.lightAttenuationEnd,
|
||||
tileLightIndices);
|
||||
|
||||
// And now shade the tile, using the lights in tileLightIndices
|
||||
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
|
||||
framebufferWidth, framebufferHeight, inputData,
|
||||
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
|
||||
tileLightIndices, numTileLights, visualizeLightCount,
|
||||
framebuffer_r, framebuffer_g, framebuffer_b);
|
||||
}
|
||||
|
||||
|
||||
// Exported entry point for the static decomposition path.
//
// Covers the framebuffer described by inputHeader with a grid of
// MIN_TILE_WIDTH x MIN_TILE_HEIGHT tiles and launches one RenderTile task
// per tile. All other arguments are forwarded unchanged to RenderTile.
export void
|
||||
RenderStatic(reference uniform InputHeader inputHeader,
|
||||
reference uniform InputDataArrays inputData,
|
||||
uniform int visualizeLightCount,
|
||||
// Output
|
||||
reference uniform unsigned int8 framebuffer_r[],
|
||||
reference uniform unsigned int8 framebuffer_g[],
|
||||
reference uniform unsigned int8 framebuffer_b[]) {
|
||||
// Ceiling division: a partial tile at the right/bottom edge still counts
// as a group of its own.
uniform int num_groups_x = (inputHeader.framebufferWidth +
|
||||
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
|
||||
uniform int num_groups_y = (inputHeader.framebufferHeight +
|
||||
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
|
||||
uniform int num_groups = num_groups_x * num_groups_y;
|
||||
|
||||
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
|
||||
// by MIN_TILE_HEIGHT pixels.
|
||||
launch[num_groups] < RenderTile(num_groups_x, num_groups_y,
|
||||
inputHeader, inputData, visualizeLightCount,
|
||||
framebuffer_r, framebuffer_g, framebuffer_b) >;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Routines for dynamic decomposition path
|
||||
|
||||
// This computes the z min/max range for a whole row worth of tiles.
|
||||
// The tile width must be a multiple of programCount (SIMD size)
|
||||
// Exported wrapper that calls ComputeZBounds() once for every tile in tile
// row tileY (tileX = 0 .. numTilesX-1) and stores the resulting bounds at
// index tileX of minZArray and maxZArray.
// numTilesY is accepted but not read by this function.
export void
|
||||
ComputeZBoundsRow(
|
||||
uniform int32 tileY,
|
||||
uniform int32 tileWidth, uniform int32 tileHeight,
|
||||
uniform int32 numTilesX, uniform int32 numTilesY,
|
||||
// G-buffer data
|
||||
uniform float zBuffer[],
|
||||
uniform int32 gBufferWidth,
|
||||
// Camera data
|
||||
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||
uniform float cameraNear, uniform float cameraFar,
|
||||
// Output
|
||||
reference uniform float minZArray[],
|
||||
reference uniform float maxZArray[]
|
||||
)
|
||||
{
|
||||
for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
|
||||
uniform float minZ, maxZ;
|
||||
// Pixel rectangle of tile (tileX, tileY): start and end in x, then
// start and end in y. minZ/maxZ are the last two arguments and are
// read back right after the call.
ComputeZBounds(
|
||||
tileX * tileWidth, tileX * tileWidth + tileWidth,
|
||||
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
||||
zBuffer, gBufferWidth,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
minZ, maxZ);
|
||||
minZArray[tileX] = minZ;
|
||||
maxZArray[tileX] = maxZ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
|
||||
// numLights need not be a multiple of programCount here, but the input and output arrays
|
||||
// should be able to handle programCount-sized load/stores.
|
||||
// For each of the numLights entries of lightIndices, decides which of the
// four subtiles (00, 10, 01, 11) the light may affect and appends its index
// to that subtile's list inside subtileIndices. Subtile k's list starts at
// offset k * subtileIndicesPitch; the number of entries written for subtile
// k is returned in subtileNumLights[k].
export void
|
||||
SplitTileMinMax(
|
||||
uniform int32 tileMidX, uniform int32 tileMidY,
|
||||
// Subtile data (00, 10, 01, 11)
|
||||
uniform float subtileMinZ[],
|
||||
uniform float subtileMaxZ[],
|
||||
// G-buffer data
|
||||
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
|
||||
// Camera data
|
||||
uniform float cameraProj_11, uniform float cameraProj_22,
|
||||
// Light Data
|
||||
reference uniform int32 lightIndices[],
|
||||
uniform int32 numLights,
|
||||
uniform float light_positionView_x_array[],
|
||||
uniform float light_positionView_y_array[],
|
||||
uniform float light_positionView_z_array[],
|
||||
uniform float light_attenuationEnd_array[],
|
||||
// Outputs
|
||||
// TODO: ISPC doesn't currently like multidimensional arrays so we'll do the
|
||||
// indexing math ourselves
|
||||
reference uniform int32 subtileIndices[],
|
||||
uniform int32 subtileIndicesPitch,
|
||||
reference uniform int32 subtileNumLights[]
|
||||
)
|
||||
{
|
||||
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
|
||||
|
||||
// Parallelize across frustum planes
|
||||
// Only have 2 frustum split planes here so may not be worth it, but
|
||||
// we'll do it for now for consistency
|
||||
uniform float frustumPlanes_xy[programCount];
|
||||
uniform float frustumPlanes_z[programCount];
|
||||
|
||||
// This one is totally constant over the whole screen... worth pulling it up at all?
|
||||
// Only lanes 0 (x split plane) and 1 (y split plane) are assigned below;
// the remaining lanes are never given a value, and only elements [0] and
// [1] of the uniform arrays are read back later.
float frustumPlanes_xy_v;
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_22 * gBufferScale_y));
|
||||
|
||||
float frustumPlanes_z_v;
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
|
||||
|
||||
// Normalize
|
||||
float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
|
||||
frustumPlanes_z_v * frustumPlanes_z_v);
|
||||
frustumPlanes_xy_v *= norm;
|
||||
frustumPlanes_z_v *= norm;
|
||||
|
||||
// Save out for uniform use later
|
||||
frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
|
||||
frustumPlanes_z[programIndex] = frustumPlanes_z_v;
|
||||
|
||||
// Initialize
|
||||
// Each subtile's write cursor starts at the beginning of its own
// subtileIndicesPitch-sized slice of subtileIndices.
uniform int32 subtileLightOffset[4];
|
||||
subtileLightOffset[0] = 0 * subtileIndicesPitch;
|
||||
subtileLightOffset[1] = 1 * subtileIndicesPitch;
|
||||
subtileLightOffset[2] = 2 * subtileIndicesPitch;
|
||||
subtileLightOffset[3] = 3 * subtileIndicesPitch;
|
||||
|
||||
for (int32 i = programIndex; i < numLights; i += programCount) {
|
||||
// TODO: ISPC says gather required here when it actually
|
||||
// isn't... this could be fixed by nesting an if() within a
|
||||
// uniform loop, but I'm not totally sure if that's a win
|
||||
// overall. For now we'll just eat the perf cost for cleanliness
|
||||
// since the below are real gathers anyways.
|
||||
int32 lightIndex = lightIndices[i];
|
||||
|
||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||
float light_positionView_z = light_positionView_z_array[lightIndex];
|
||||
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
||||
float light_attenuationEndNeg = -light_attenuationEnd;
|
||||
|
||||
// Test lights against subtile z bounds
|
||||
// A light passes for subtile k when the interval
// [z - attenuationEnd, z + attenuationEnd] overlaps
// [subtileMinZ[k], subtileMaxZ[k]].
bool inFrustum[4];
|
||||
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
|
||||
|
||||
// dx / dy: dot product of the light position with the normalized
// x / y split plane computed above (elements [0] and [1]).
float dx = light_positionView_z * frustumPlanes_z[0] +
|
||||
light_positionView_x * frustumPlanes_xy[0];
|
||||
float dy = light_positionView_z * frustumPlanes_z[1] +
|
||||
light_positionView_y * frustumPlanes_xy[1];
|
||||
|
||||
// If |dx| exceeds the attenuation end, the light is kept only for the
// two subtiles on the side selected by the sign of dx.
cif (abs(dx) > light_attenuationEnd) {
|
||||
bool positiveX = dx > 0.0f;
|
||||
inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile
|
||||
inFrustum[1] = inFrustum[1] && !positiveX; // 10 subtile
|
||||
inFrustum[2] = inFrustum[2] && positiveX; // 01 subtile
|
||||
inFrustum[3] = inFrustum[3] && !positiveX; // 11 subtile
|
||||
}
|
||||
// Same test against the y split plane.
cif (abs(dy) > light_attenuationEnd) {
|
||||
bool positiveY = dy > 0.0f;
|
||||
inFrustum[0] = inFrustum[0] && positiveY; // 00 subtile
|
||||
inFrustum[1] = inFrustum[1] && positiveY; // 10 subtile
|
||||
inFrustum[2] = inFrustum[2] && !positiveY; // 01 subtile
|
||||
inFrustum[3] = inFrustum[3] && !positiveY; // 11 subtile
|
||||
}
|
||||
|
||||
// Pack and store intersecting lights
|
||||
// TODO: Experiment with a loop here instead
|
||||
// The value returned by packed_store_active() is added to the
// subtile's write cursor after each store.
cif (inFrustum[0])
|
||||
subtileLightOffset[0] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[0],
|
||||
lightIndex);
|
||||
cif (inFrustum[1])
|
||||
subtileLightOffset[1] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[1],
|
||||
lightIndex);
|
||||
cif (inFrustum[2])
|
||||
subtileLightOffset[2] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[2],
|
||||
lightIndex);
|
||||
cif (inFrustum[3])
|
||||
subtileLightOffset[3] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[3],
|
||||
lightIndex);
|
||||
}
|
||||
|
||||
// Convert each final write cursor back into a per-subtile light count by
// subtracting the slice's starting offset.
subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
|
||||
subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
|
||||
subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
|
||||
subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
|
||||
}
|
||||
139
examples/deferred/main.cpp
Normal file
139
examples/deferred/main.cpp
Normal file
@@ -0,0 +1,139 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#define NOMINMAX
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include "../timing.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Benchmark driver for the deferred shading example.
//
// Expects exactly one argument: the path of the input data file. Loads it,
// then times up to three renderers (ispc static + tasks, ispc + Cilk dynamic
// when __cilk is defined, and serial C++ dynamic), prints the best timing of
// each, and writes one .ppm image per renderer.
// Returns 1 on a usage error or if the input cannot be loaded, 0 otherwise.
int main(int argc, char** argv) {
|
||||
if (argc != 2) {
|
||||
printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
InputData *input = CreateInputDataFromFile(argv[1]);
|
||||
if (!input) {
|
||||
printf("Failed to load input file \"%s\"!\n", argv[1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
Framebuffer framebuffer(input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
|
||||
InitDynamicC(input);
|
||||
#ifdef __cilk
|
||||
InitDynamicCilk(input);
|
||||
#endif // __cilk
|
||||
|
||||
// Each timing below runs 5 trials of nframes frames and keeps the minimum
// per-frame cost (in millions of cycles) over the trials.
int nframes = 5;
|
||||
double ispcCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
ispc::RenderStatic(&input->header, &input->arrays,
|
||||
VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer.r, framebuffer.g, framebuffer.b);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
ispcCycles = std::min(ispcCycles, mcycles);
|
||||
}
|
||||
printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
|
||||
"%d x %d image\n", ispcCycles,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight);
|
||||
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
|
||||
|
||||
#ifdef __cilk
|
||||
// Same measurement for the Cilk-dispatched dynamic path.
double dynamicCilkCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
DispatchDynamicCilk(input, &framebuffer);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
|
||||
}
|
||||
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n",
|
||||
dynamicCilkCycles);
|
||||
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
|
||||
#endif // __cilk
|
||||
|
||||
// Same measurement for the serial C++ dynamic path; this is the baseline
// used for the speedup figures printed below.
double serialCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
DispatchDynamicC(input, &framebuffer);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
serialCycles = std::min(serialCycles, mcycles);
|
||||
}
|
||||
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n",
|
||||
serialCycles);
|
||||
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
|
||||
|
||||
#ifdef __cilk
|
||||
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
|
||||
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
|
||||
#else
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
|
||||
#endif // __cilk
|
||||
|
||||
DeleteInputData(input);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -18,8 +18,11 @@ EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "volume", "volume_rendering\volume.vcxproj", "{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.vcxproj", "{2EF070A1-F62F-4E6A-944B-88D140945C3C}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Win32 = Debug|Win32
|
||||
@@ -108,6 +111,14 @@ Global
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.Build.0 = Release|Win32
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.ActiveCfg = Release|x64
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.Build.0 = Release|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.Build.0 = Debug|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
|
||||
|
||||
default: mandelbrot
|
||||
|
||||
@@ -14,13 +14,17 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ mandelbrot
|
||||
|
||||
mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o -lm
|
||||
OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc_sse2.o \
|
||||
objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o \
|
||||
objs/mandelbrot_ispc.o
|
||||
|
||||
mandelbrot: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -41,7 +41,6 @@
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "mandelbrot_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
@@ -68,38 +67,6 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
// Picks the CPU capability check that matches whichever ISPC_TARGET_* macro
// is defined (compilation fails if none is), and if the check reports the
// instruction set as unsupported, prints an explanation to stderr and
// terminates the process with exit status 1.
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
// The hint names the build file to edit: the MSVC project file when
// _MSC_VER is defined, the Makefile otherwise.
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
unsigned int width = 768;
|
||||
unsigned int height = 512;
|
||||
@@ -111,8 +78,6 @@ int main() {
|
||||
int maxIterations = 256;
|
||||
int *buf = new int[width*height];
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation; report the minimum
|
||||
// time of three runs.
|
||||
|
||||
24
examples/mandelbrot/mandelbrot.vcxproj
Executable file → Normal file
24
examples/mandelbrot/mandelbrot.vcxproj
Executable file → Normal file
@@ -64,15 +64,19 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -81,6 +85,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -96,6 +101,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -113,6 +119,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -131,6 +138,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -147,18 +155,18 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="mandelbrot.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
@@ -1,20 +1,18 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasks_pthreads.cpp
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
|
||||
ifeq ($(ARCH), Darwin)
|
||||
TASK_CXX=../tasks_gcd.cpp
|
||||
TASK_LIB=
|
||||
endif
|
||||
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
|
||||
|
||||
OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o $(TASK_OBJ) \
|
||||
objs/mandelbrot_ispc.o objs/mandelbrot_ispc_sse2.o \
|
||||
objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o
|
||||
|
||||
default: mandelbrot
|
||||
|
||||
@@ -26,8 +24,8 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ mandelbrot
|
||||
|
||||
mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
mandelbrot: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
@@ -37,5 +35,5 @@ objs/%.o: ../%.cpp
|
||||
|
||||
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -42,7 +42,6 @@
|
||||
#include <algorithm>
|
||||
#include <string.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "mandelbrot_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
@@ -69,39 +68,8 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
static void usage() {
|
||||
fprintf(stderr, "usage: mandelbrot [--scale=<factor]\n");
|
||||
fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@@ -132,8 +100,6 @@ int main(int argc, char *argv[]) {
|
||||
else
|
||||
usage();
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
int maxIterations = 512;
|
||||
int *buf = new int[width*height];
|
||||
|
||||
@@ -143,6 +109,9 @@ int main(int argc, char *argv[]) {
|
||||
//
|
||||
double minISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
reset_and_start_timer();
|
||||
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
@@ -152,9 +121,6 @@ int main(int argc, char *argv[]) {
|
||||
printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
|
||||
writePPM(buf, width, height, "mandelbrot-ispc.ppm");
|
||||
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
@@ -162,6 +128,9 @@ int main(int argc, char *argv[]) {
|
||||
//
|
||||
double minSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
reset_and_start_timer();
|
||||
mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
|
||||
@@ -53,11 +53,14 @@ mandel(float c_re, float c_im, int count) {
|
||||
[ystart,yend).
|
||||
*/
|
||||
task void
|
||||
mandelbrot_scanlines(uniform int ystart, uniform int yend,
|
||||
mandelbrot_scanlines(uniform int ybase, uniform int span,
|
||||
uniform float x0, uniform float dx,
|
||||
uniform float y0, uniform float dy,
|
||||
uniform int width, uniform int maxIterations,
|
||||
reference uniform int output[]) {
|
||||
uniform int ystart = ybase + taskIndex * span;
|
||||
uniform int yend = ystart + span;
|
||||
|
||||
for (uniform int j = ystart; j < yend; ++j) {
|
||||
for (uniform int i = 0; i < width; i += programCount) {
|
||||
float x = x0 + (programIndex + i) * dx;
|
||||
@@ -70,6 +73,20 @@ mandelbrot_scanlines(uniform int ystart, uniform int yend,
|
||||
}
|
||||
|
||||
|
||||
task void
|
||||
mandelbrot_chunk(uniform float x0, uniform float dx,
|
||||
uniform float y0, uniform float dy,
|
||||
uniform int width, uniform int height,
|
||||
uniform int maxIterations, reference uniform int output[]) {
|
||||
uniform int ystart = taskIndex * (height/taskCount);
|
||||
uniform int yend = (taskIndex+1) * (height/taskCount);
|
||||
uniform int span = 1;
|
||||
|
||||
launch[(yend-ystart)/span] < mandelbrot_scanlines(ystart, span, x0, dx, y0, dy,
|
||||
width, maxIterations, output) >;
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
uniform float x1, uniform float y1,
|
||||
@@ -78,9 +95,6 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
uniform float dx = (x1 - x0) / width;
|
||||
uniform float dy = (y1 - y0) / height;
|
||||
|
||||
/* Launch task to compute results for spans of 'span' scanlines. */
|
||||
uniform int span = 2;
|
||||
for (uniform int j = 0; j < height; j += span)
|
||||
launch < mandelbrot_scanlines(j, j+span, x0, dx, y0, dy, width,
|
||||
maxIterations, output) >;
|
||||
launch[32] < mandelbrot_chunk(x0, dx, y0, dy, width, height,
|
||||
maxIterations, output) >;
|
||||
}
|
||||
|
||||
26
examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
Executable file → Normal file
26
examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
Executable file → Normal file
@@ -64,15 +64,19 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -81,6 +85,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -96,6 +101,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -113,6 +119,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -131,6 +138,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -143,23 +151,23 @@
|
||||
<ItemGroup>
|
||||
<ClCompile Include="mandelbrot.cpp" />
|
||||
<ClCompile Include="mandelbrot_serial.cpp" />
|
||||
<ClCompile Include="../tasks_concrt.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="mandelbrot.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
@@ -2,7 +2,10 @@
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4,avx-x2 --arch=x86-64
|
||||
|
||||
OBJS=objs/noise.o objs/noise_serial.o objs/noise_ispc.o objs/noise_ispc_sse2.o \
|
||||
objs/noise_ispc_sse4.o objs/noise_ispc_avx.o
|
||||
|
||||
default: noise
|
||||
|
||||
@@ -14,13 +17,13 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ noise
|
||||
|
||||
noise: dirs objs/noise.o objs/noise_serial.o objs/noise_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/noise.o objs/noise_ispc.o objs/noise_serial.o -lm
|
||||
noise: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/noise.o: objs/noise_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -41,7 +41,6 @@
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "noise_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
@@ -66,38 +65,6 @@ writePPM(float *buf, int width, int height, const char *fn) {
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
unsigned int width = 768;
|
||||
unsigned int height = 768;
|
||||
@@ -108,8 +75,6 @@ int main() {
|
||||
|
||||
float *buf = new float[width*height];
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation; report the minimum
|
||||
// time of three runs.
|
||||
|
||||
28
examples/noise/noise.vcxproj
Executable file → Normal file
28
examples/noise/noise.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -64,15 +64,19 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -81,6 +85,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -96,6 +101,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -113,6 +119,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -131,6 +138,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -147,21 +155,21 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="noise.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
@@ -2,7 +2,11 @@
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -g -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
|
||||
|
||||
OBJS=objs/options.o objs/options_serial.o objs/options_ispc.o \
|
||||
objs/options_ispc_sse2.o objs/options_ispc_sse4.o \
|
||||
objs/options_ispc_avx.o
|
||||
|
||||
default: options
|
||||
|
||||
@@ -14,13 +18,13 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ options
|
||||
|
||||
options: dirs objs/options.o objs/options_serial.o objs/options_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/options.o objs/options_ispc.o objs/options_serial.o -lm
|
||||
options: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/options.o: objs/options_ispc.h options_defs.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc options_defs.h
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc options_defs.h
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -41,7 +41,6 @@ using std::max;
|
||||
|
||||
#include "options_defs.h"
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
|
||||
#include "options_ispc.h"
|
||||
using namespace ispc;
|
||||
@@ -54,41 +53,7 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
|
||||
float ra[], float va[],
|
||||
float result[], int count);
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
float *S = new float[N_OPTIONS];
|
||||
float *X = new float[N_OPTIONS];
|
||||
float *T = new float[N_OPTIONS];
|
||||
|
||||
26
examples/options/options.vcxproj
Executable file → Normal file
26
examples/options/options.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -64,15 +64,19 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -81,6 +85,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
@@ -97,6 +102,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
@@ -115,6 +121,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -134,6 +141,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4305</DisableSpecificWarnings>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -151,18 +159,18 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="options.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
|
||||
@@ -1,20 +1,17 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasks_pthreads.cpp
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
|
||||
ifeq ($(ARCH), Darwin)
|
||||
TASK_CXX=../tasks_gcd.cpp
|
||||
TASK_LIB=
|
||||
endif
|
||||
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
|
||||
|
||||
OBJS=objs/rt.o objs/rt_serial.o $(TASK_OBJ) objs/rt_ispc.o objs/rt_ispc_sse2.o \
|
||||
objs/rt_ispc_sse4.o objs/rt_ispc_avx.o
|
||||
|
||||
default: rt
|
||||
|
||||
@@ -26,8 +23,8 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ rt
|
||||
|
||||
rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
rt: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
@@ -37,5 +34,5 @@ objs/%.o: ../%.cpp
|
||||
|
||||
objs/rt.o: objs/rt_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -45,7 +45,6 @@
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "rt_ispc.h"
|
||||
|
||||
using namespace ispc;
|
||||
@@ -96,38 +95,6 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void usage() {
|
||||
fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
|
||||
exit(1);
|
||||
@@ -151,8 +118,6 @@ int main(int argc, char *argv[]) {
|
||||
if (filename == NULL)
|
||||
usage();
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
#define READ(var, n) \
|
||||
if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \
|
||||
fprintf(stderr, "Unexpected EOF reading scene file\n"); \
|
||||
@@ -203,12 +168,12 @@ int main(int argc, char *argv[]) {
|
||||
// of node, the total number of int it if a leaf node, etc.
|
||||
float b[6];
|
||||
READ(b[0], 6);
|
||||
nodes[i].bounds[0].v[0] = b[0];
|
||||
nodes[i].bounds[0].v[1] = b[1];
|
||||
nodes[i].bounds[0].v[2] = b[2];
|
||||
nodes[i].bounds[1].v[0] = b[3];
|
||||
nodes[i].bounds[1].v[1] = b[4];
|
||||
nodes[i].bounds[1].v[2] = b[5];
|
||||
nodes[i].bounds[0][0] = b[0];
|
||||
nodes[i].bounds[0][1] = b[1];
|
||||
nodes[i].bounds[0][2] = b[2];
|
||||
nodes[i].bounds[1][0] = b[3];
|
||||
nodes[i].bounds[1][1] = b[4];
|
||||
nodes[i].bounds[1][2] = b[5];
|
||||
READ(nodes[i].offset, 1);
|
||||
READ(nodes[i].nPrimitives, 1);
|
||||
READ(nodes[i].splitAxis, 1);
|
||||
@@ -225,9 +190,9 @@ int main(int argc, char *argv[]) {
|
||||
READ(v[0], 9);
|
||||
float *vp = v;
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
triangles[i].p[j].v[0] = *vp++;
|
||||
triangles[i].p[j].v[1] = *vp++;
|
||||
triangles[i].p[j].v[2] = *vp++;
|
||||
triangles[i].p[j][0] = *vp++;
|
||||
triangles[i].p[j][1] = *vp++;
|
||||
triangles[i].p[j][2] = *vp++;
|
||||
}
|
||||
// And create an object id
|
||||
triangles[i].id = i+1;
|
||||
|
||||
@@ -43,12 +43,13 @@ struct Ray {
|
||||
};
|
||||
|
||||
struct Triangle {
|
||||
uniform float3 p[3];
|
||||
uniform float p[3][4];
|
||||
uniform int id;
|
||||
uniform int pad[3];
|
||||
};
|
||||
|
||||
struct LinearBVHNode {
|
||||
uniform float3 bounds[2];
|
||||
uniform float bounds[2][3];
|
||||
uniform unsigned int offset; // num primitives for leaf, second child for interior
|
||||
uniform unsigned int8 nPrimitives;
|
||||
uniform unsigned int8 splitAxis;
|
||||
@@ -103,14 +104,16 @@ static void generateRay(uniform const float raster2camera[4][4],
|
||||
}
|
||||
|
||||
|
||||
static inline bool BBoxIntersect(const reference uniform float3 bounds[2],
|
||||
static inline bool BBoxIntersect(const uniform float bounds[2][3],
|
||||
const reference Ray ray) {
|
||||
uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
|
||||
uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
|
||||
float t0 = ray.mint, t1 = ray.maxt;
|
||||
|
||||
// Check all three axis-aligned slabs. Don't try to early out; it's
|
||||
// not worth the trouble
|
||||
float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
|
||||
float3 tFar = (bounds[1] - ray.origin) * ray.invDir;
|
||||
float3 tNear = (bounds0 - ray.origin) * ray.invDir;
|
||||
float3 tFar = (bounds1 - ray.origin) * ray.invDir;
|
||||
if (tNear.x > tFar.x) {
|
||||
float tmp = tNear.x;
|
||||
tNear.x = tFar.x;
|
||||
@@ -141,8 +144,11 @@ static inline bool BBoxIntersect(const reference uniform float3 bounds[2],
|
||||
|
||||
|
||||
static inline bool TriIntersect(const reference Triangle tri, reference Ray ray) {
|
||||
uniform float3 e1 = tri.p[1] - tri.p[0];
|
||||
uniform float3 e2 = tri.p[2] - tri.p[0];
|
||||
uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
|
||||
uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
|
||||
uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
|
||||
uniform float3 e1 = p1 - p0;
|
||||
uniform float3 e2 = p2 - p0;
|
||||
|
||||
float3 s1 = Cross(ray.dir, e2);
|
||||
float divisor = Dot(s1, e1);
|
||||
@@ -153,7 +159,7 @@ static inline bool TriIntersect(const reference Triangle tri, reference Ray ray)
|
||||
float invDivisor = 1.f / divisor;
|
||||
|
||||
// Compute first barycentric coordinate
|
||||
float3 d = ray.origin - tri.p[0];
|
||||
float3 d = ray.origin - p0;
|
||||
float b1 = Dot(d, s1) * invDivisor;
|
||||
if (b1 < 0. || b1 > 1.)
|
||||
hit = false;
|
||||
@@ -283,15 +289,20 @@ export void raytrace_ispc(uniform int width, uniform int height,
|
||||
}
|
||||
|
||||
|
||||
task void raytrace_tile_task(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int width, uniform int height,
|
||||
task void raytrace_tile_task(uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
uniform int dx = 16, dy = 16; // must match dx, dy below
|
||||
uniform int xBuckets = (width + (dx-1)) / dx;
|
||||
uniform int x0 = (taskIndex % xBuckets) * dx;
|
||||
uniform int x1 = min(x0 + dx, width);
|
||||
uniform int y0 = (taskIndex / xBuckets) * dy;
|
||||
uniform int y1 = min(y0 + dy, height);
|
||||
|
||||
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world, image,
|
||||
id, nodes, triangles);
|
||||
@@ -306,13 +317,11 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
uniform int dx = 16, dy = 16;
|
||||
for (uniform int y = 0; y < height; y += dy) {
|
||||
uniform int y1 = min(y + dy, height);
|
||||
for (uniform int x = 0; x < width; x += dx) {
|
||||
uniform int x1 = min(x + dx, width);
|
||||
launch < raytrace_tile_task(x, x1, y, y1, width, height, baseWidth,
|
||||
baseHeight, raster2camera, camera2world,
|
||||
uniform int xBuckets = (width + (dx-1)) / dx;
|
||||
uniform int yBuckets = (height + (dy-1)) / dy;
|
||||
uniform int nTasks = xBuckets * yBuckets;
|
||||
launch[nTasks] < raytrace_tile_task(width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world,
|
||||
image, id, nodes, triangles) >;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
26
examples/rt/rt.vcxproj
Executable file → Normal file
26
examples/rt/rt.vcxproj
Executable file → Normal file
@@ -64,15 +64,19 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -81,6 +85,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -96,6 +101,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -113,6 +119,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -131,6 +138,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -144,27 +152,27 @@
|
||||
<CustomBuild Include="rt.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="rt.cpp" />
|
||||
<ClCompile Include="rt_serial.cpp" />
|
||||
<ClCompile Include="../tasks_concrt.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
|
||||
@@ -75,12 +75,13 @@ struct Ray {
|
||||
// Declare these in a namespace so the mangling matches
|
||||
namespace ispc {
|
||||
struct Triangle {
|
||||
float3 p[3];
|
||||
float p[3][4]; // extra float pad after each vertex
|
||||
int32_t id;
|
||||
int32_t pad[3]; // make 16 x 32-bits
|
||||
};
|
||||
|
||||
struct LinearBVHNode {
|
||||
float3 bounds[2];
|
||||
float bounds[2][3];
|
||||
int32_t offset; // primitives for leaf, second child for interior
|
||||
uint8_t nPrimitives;
|
||||
uint8_t splitAxis;
|
||||
@@ -140,12 +141,14 @@ static void generateRay(const float raster2camera[4][4],
|
||||
}
|
||||
|
||||
|
||||
static inline bool BBoxIntersect(const float3 bounds[2],
|
||||
static inline bool BBoxIntersect(const float bounds[2][3],
|
||||
const Ray &ray) {
|
||||
float3 bounds0(bounds[0][0], bounds[0][1], bounds[0][2]);
|
||||
float3 bounds1(bounds[1][0], bounds[1][1], bounds[1][2]);
|
||||
float t0 = ray.mint, t1 = ray.maxt;
|
||||
|
||||
float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
|
||||
float3 tFar = (bounds[1] - ray.origin) * ray.invDir;
|
||||
float3 tNear = (bounds0 - ray.origin) * ray.invDir;
|
||||
float3 tFar = (bounds1 - ray.origin) * ray.invDir;
|
||||
if (tNear.x > tFar.x) {
|
||||
float tmp = tNear.x;
|
||||
tNear.x = tFar.x;
|
||||
@@ -176,8 +179,11 @@ static inline bool BBoxIntersect(const float3 bounds[2],
|
||||
|
||||
|
||||
inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
||||
float3 e1 = tri.p[1] - tri.p[0];
|
||||
float3 e2 = tri.p[2] - tri.p[0];
|
||||
float3 p0(tri.p[0][0], tri.p[0][1], tri.p[0][2]);
|
||||
float3 p1(tri.p[1][0], tri.p[1][1], tri.p[1][2]);
|
||||
float3 p2(tri.p[2][0], tri.p[2][1], tri.p[2][2]);
|
||||
float3 e1 = p1 - p0;
|
||||
float3 e2 = p2 - p0;
|
||||
|
||||
float3 s1 = Cross(ray.dir, e2);
|
||||
float divisor = Dot(s1, e1);
|
||||
@@ -187,7 +193,7 @@ inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
||||
float invDivisor = 1.f / divisor;
|
||||
|
||||
// Compute first barycentric coordinate
|
||||
float3 d = ray.origin - tri.p[0];
|
||||
float3 d = ray.origin - p0;
|
||||
float b1 = Dot(d, s1) * invDivisor;
|
||||
if (b1 < 0. || b1 > 1.)
|
||||
return false;
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --arch=x86-64
|
||||
ISPCFLAGS=-O2 --arch=x86-64 --target=sse2
|
||||
|
||||
default: simple
|
||||
|
||||
|
||||
@@ -33,47 +33,12 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "../cpuid.h"
|
||||
|
||||
// Include the header file that the ispc compiler generates
|
||||
#include "simple_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
float vin[16], vout[16];
|
||||
|
||||
// Initialize input buffer
|
||||
|
||||
26
examples/simple/simple.vcxproj
Executable file → Normal file
26
examples/simple/simple.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -25,21 +25,21 @@
|
||||
<CustomBuild Include="simple.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
ispc -O2 %(Filename).ispco %(Filename).obj -h %(Filename)_ispc.h
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
@@ -88,15 +88,19 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -105,6 +109,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -118,6 +123,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -133,6 +139,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -150,6 +157,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
|
||||
@@ -1,20 +1,18 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasks_pthreads.cpp
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
|
||||
ifeq ($(ARCH), Darwin)
|
||||
TASK_CXX=../tasks_gcd.cpp
|
||||
TASK_LIB=
|
||||
endif
|
||||
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
|
||||
|
||||
OBJS=objs/stencil.o objs/stencil_serial.o $(TASK_OBJ) objs/stencil_ispc.o \
|
||||
objs/stencil_ispc_sse2.o objs/stencil_ispc_sse4.o \
|
||||
objs/stencil_ispc_avx.o
|
||||
|
||||
default: stencil
|
||||
|
||||
@@ -26,8 +24,8 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ stencil
|
||||
|
||||
stencil: dirs objs/stencil.o objs/stencil_serial.o objs/stencil_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/stencil.o objs/stencil_ispc.o objs/stencil_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
stencil: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
@@ -37,5 +35,5 @@ objs/%.o: ../%.cpp
|
||||
|
||||
objs/stencil.o: objs/stencil_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -42,43 +42,10 @@
|
||||
#include <algorithm>
|
||||
#include <math.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "stencil_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
||||
int y0, int y1, int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
@@ -100,8 +67,6 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
|
||||
|
||||
|
||||
int main() {
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
int Nx = 256, Ny = 256, Nz = 256;
|
||||
int width = 4;
|
||||
float *Aserial[2], *Aispc[2];
|
||||
|
||||
26
examples/stencil/stencil.vcxproj
Executable file → Normal file
26
examples/stencil/stencil.vcxproj
Executable file → Normal file
@@ -64,15 +64,19 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -81,6 +85,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -96,6 +101,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -113,6 +119,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -131,6 +138,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -144,27 +152,27 @@
|
||||
<CustomBuild Include="stencil.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="stencil.cpp" />
|
||||
<ClCompile Include="stencil_serial.cpp" />
|
||||
<ClCompile Include="../tasks_concrt.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
|
||||
@@ -1,180 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef TASKINFO_H
|
||||
#define TASKINFO_H 1
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#include <concrt.h>
|
||||
using namespace Concurrency;
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
|
||||
// Determine the size of a pointer in bytes.  The compiler-provided
// __SIZEOF_POINTER__ is trusted first when it exists.  Otherwise the 64-bit
// indicators must be tested before the 32-bit ones, because _WIN32 is
// defined for both 32-bit and 64-bit Windows targets.
#if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
#define ISPC_POINTER_BYTES 4
#elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
#define ISPC_POINTER_BYTES 8
#elif defined(_WIN64) || defined(__x86_64__) || defined(__amd64__)
#define ISPC_POINTER_BYTES 8
#elif defined(_WIN32) || defined(__i386__)
#define ISPC_POINTER_BYTES 4
#else
#error "Pointer size unknown!"
#endif // ISPC_POINTER_BYTES
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
|
||||
/** Record describing one launched task: the task entry point (stored as an
    untyped pointer) and the argument block passed to it.  On Windows the
    record additionally carries a Concurrency Runtime event. */
struct TaskInfo {
    void *func;
    void *data;
#ifdef ISPC_IS_WINDOWS
    event taskEvent;
#endif // ISPC_IS_WINDOWS
};
|
||||
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
/** Atomic 32-bit compare-and-swap.

    If *v equals oldValue, newValue is stored into *v.  In either case the
    value that *v held before the operation is returned, so the swap took
    place if and only if the return value equals oldValue.

    Implemented with the GCC/Clang __sync builtin, which acts as a full
    memory barrier and is not tied to x86, instead of hand-written inline
    assembly.
 */
static int32_t
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
    // Note the builtin's argument order: (ptr, expected, desired).
    return __sync_val_compare_and_swap(v, oldValue, newValue);
}
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
|
||||
|
||||
/** Atomic pointer-sized compare-and-swap.

    If *v equals oldValue, newValue is stored into *v.  The value *v held
    before the operation is returned, so the swap happened if and only if
    the return value equals oldValue.

    On Windows this forwards to InterlockedCompareExchangePointer; elsewhere
    it uses the GCC/Clang __sync builtin, which picks the right operand
    width for the target (the previous 32-bit inline assembly used the
    non-existent "cmpxchgd" mnemonic).
 */
static void *
lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
#ifdef ISPC_IS_WINDOWS
    return InterlockedCompareExchangePointer(v, newValue, oldValue);
#else
    // Builtin argument order is (ptr, expected, desired); it is a full barrier.
    return __sync_val_compare_and_swap(v, oldValue, newValue);
#endif // ISPC_IS_WINDOWS
}
|
||||
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
/** Atomically adds delta to *v and returns the value *v held before the
    addition.

    Uses the GCC/Clang __sync builtin (a full memory barrier) rather than
    x86-specific inline assembly.
 */
static int32_t
lAtomicAdd32(volatile int32_t *v, int32_t delta) {
    return __sync_fetch_and_add(v, delta);
}
|
||||
#endif
|
||||
|
||||
#define LOG_TASK_QUEUE_CHUNK_SIZE 13
|
||||
#define MAX_TASK_QUEUE_CHUNKS 1024
|
||||
#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
|
||||
|
||||
#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)
|
||||
|
||||
typedef void (*TaskFuncType)(void *, int, int);
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
static volatile LONG nextTaskInfoCoordinate;
|
||||
#else
|
||||
static volatile int nextTaskInfoCoordinate;
|
||||
#endif
|
||||
|
||||
static TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
|
||||
|
||||
static inline void
|
||||
lInitTaskInfo() {
|
||||
taskInfo[0] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
|
||||
}
|
||||
|
||||
|
||||
/** Hands out the next unused TaskInfo record.

    A global coordinate is atomically incremented to claim a slot; its high
    bits select a chunk in the taskInfo table and its low bits select the
    entry inside that chunk.  Chunks are allocated on first use.  If the
    chunk table is exhausted the process prints a message and exits.

    @return Pointer to the claimed TaskInfo record.
 */
static inline TaskInfo *
lGetTaskInfo() {
#ifdef ISPC_IS_WINDOWS
    int myCoord = InterlockedAdd(&nextTaskInfoCoordinate, 1)-1;
#else
    int myCoord = lAtomicAdd32(&nextTaskInfoCoordinate, 1);
#endif
    int index = (myCoord >> LOG_TASK_QUEUE_CHUNK_SIZE);
    int offset = myCoord & (TASK_QUEUE_CHUNK_SIZE-1);

    // Use >= rather than ==: with several threads launching concurrently
    // the coordinate can move past the first out-of-range value, and any
    // such index would otherwise read beyond the end of taskInfo[].
    if (index >= MAX_TASK_QUEUE_CHUNKS) {
        fprintf(stderr, "A total of %d tasks have been launched--the simple "
                "built-in task system can handle no more. Exiting.", myCoord);
        exit(1);
    }

    if (taskInfo[index] == NULL) {
        TaskInfo *newChunk = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
        if (lAtomicCompareAndSwapPointer((void **)&taskInfo[index], newChunk,
                                         NULL) != NULL) {
            // Another thread installed a chunk first; discard ours.  The
            // chunk came from new[], so it must be released with delete[]
            // (not free()) so that element destructors run.
            assert(taskInfo[index] != NULL);
            delete[] newChunk;
        }
    }

    return &taskInfo[index][offset];
}
|
||||
|
||||
|
||||
static inline void
|
||||
lResetTaskInfo() {
|
||||
nextTaskInfoCoordinate = 0;
|
||||
}
|
||||
|
||||
#endif // TASKINFO_H
|
||||
@@ -1,104 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "taskinfo.h"
|
||||
|
||||
/* Simple task system implementation for ispc based on Microsoft's
|
||||
Concurrency Runtime. */
|
||||
|
||||
#include <windows.h>
|
||||
#include <concrt.h>
|
||||
using namespace Concurrency;
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <algorithm>
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
void ISPCLaunch(void *f, void *data);
|
||||
void ISPCSync();
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
||||
void ISPCFree(void *ptr);
|
||||
}
|
||||
|
||||
|
||||
void __cdecl
|
||||
lRunTask(LPVOID param) {
|
||||
TaskInfo *ti = (TaskInfo *)param;
|
||||
|
||||
// Actually run the task.
|
||||
// FIXME: like the GCD implementation for OS X, this is passing bogus
|
||||
// values for the threadIndex and threadCount builtins, which in turn
|
||||
// will cause bugs in code that uses those.
|
||||
int threadIndex = 0;
|
||||
int threadCount = 1;
|
||||
TaskFuncType func = (TaskFuncType)ti->func;
|
||||
func(ti->data, threadIndex, threadCount);
|
||||
|
||||
// Signal the event that this task is done
|
||||
ti->taskEvent.set();
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ISPCLaunch(void *func, void *data) {
|
||||
TaskInfo *ti = lGetTaskInfo();
|
||||
ti->func = (TaskFuncType)func;
|
||||
ti->data = data;
|
||||
ti->taskEvent.reset();
|
||||
CurrentScheduler::ScheduleTask(lRunTask, ti);
|
||||
}
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
for (int i = 0; i < nextTaskInfoCoordinate; ++i) {
|
||||
int index = (i >> LOG_TASK_QUEUE_CHUNK_SIZE);
|
||||
int offset = i & (TASK_QUEUE_CHUNK_SIZE-1);
|
||||
taskInfo[index][offset].taskEvent.wait();
|
||||
taskInfo[index][offset].taskEvent.reset();
|
||||
}
|
||||
|
||||
lResetTaskInfo();
|
||||
}
|
||||
|
||||
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment) {
|
||||
return _aligned_malloc(size, alignment);
|
||||
}
|
||||
|
||||
|
||||
/** Releases a block through _aligned_free(), the counterpart of the
    _aligned_malloc() call used by ISPCMalloc() in this file. */
void ISPCFree(void *ptr) {
    _aligned_free(ptr);
}
|
||||
@@ -1,126 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "taskinfo.h"
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
/* A simple task system for ispc programs based on Apple's Grand Central
|
||||
Dispatch. */
|
||||
#include <dispatch/dispatch.h>
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static int initialized = 0;
|
||||
static volatile int32_t lock = 0;
|
||||
static dispatch_queue_t gcdQueue;
|
||||
static dispatch_group_t gcdGroup;
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
void ISPCLaunch(void *f, void *data);
|
||||
void ISPCSync();
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
||||
void ISPCFree(void *ptr);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
lRunTask(void *ti) {
|
||||
TaskInfo *taskInfo = (TaskInfo *)ti;
|
||||
// FIXME: these are bogus values; may cause bugs in code that depends
|
||||
// on them having unique values in different threads.
|
||||
int threadIndex = 0;
|
||||
int threadCount = 1;
|
||||
TaskFuncType func = (TaskFuncType)(taskInfo->func);
|
||||
|
||||
// Actually run the task
|
||||
func(taskInfo->data, threadIndex, threadCount);
|
||||
}
|
||||
|
||||
|
||||
void ISPCLaunch(void *func, void *data) {
|
||||
if (!initialized) {
|
||||
while (1) {
|
||||
if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
|
||||
if (!initialized) {
|
||||
gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
|
||||
gcdGroup = dispatch_group_create();
|
||||
lInitTaskInfo();
|
||||
__asm__ __volatile__("mfence":::"memory");
|
||||
initialized = 1;
|
||||
}
|
||||
lock = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TaskInfo *ti = lGetTaskInfo();
|
||||
ti->func = func;
|
||||
ti->data = data;
|
||||
dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
|
||||
}
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
if (!initialized)
|
||||
return;
|
||||
|
||||
// Wait for all of the tasks in the group to complete before returning
|
||||
dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
|
||||
|
||||
lResetTaskInfo();
|
||||
}
|
||||
|
||||
/** Allocates size bytes aligned to alignment (assumed to be a power of
    two) on top of plain malloc().

    Enough extra space is requested to hold one pointer-sized header plus
    worst-case alignment padding.  The pointer returned by malloc() is
    stashed in the slot immediately before the returned address, which is
    where ISPCFree() looks for it.

    @return The aligned block, or NULL if the underlying malloc() fails.
 */
void *ISPCMalloc(int64_t size, int32_t alignment) {
    const uintptr_t mask = (uintptr_t)alignment - 1;

    void *mem = malloc((size_t)size + mask + sizeof(void*));
    if (mem == NULL)
        return NULL;

    // Skip the header, then round *up* to the next multiple of alignment.
    // (Unconditionally adding "alignment - (addr & mask)" would advance by
    // a whole extra 'alignment' bytes when addr is already aligned, pushing
    // the end of the user region one byte past the allocation.)
    uintptr_t addr = (uintptr_t)mem + sizeof(void*);
    addr = (addr + mask) & ~mask;

    char *amem = (char *)addr;
    ((void**)amem)[-1] = mem;
    return amem;
}
|
||||
|
||||
|
||||
/** Frees a block by reading the pointer stored in the slot just before
    ptr and passing that pointer to free(). */
void ISPCFree(void *ptr) {
    void **header = (void **)ptr;
    free(header[-1]);
}
|
||||
|
||||
@@ -1,339 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#include "taskinfo.h"
|
||||
#include <pthread.h>
|
||||
#include <semaphore.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/sysctl.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
static int initialized = 0;
|
||||
static volatile int32_t lock = 0;
|
||||
|
||||
static int nThreads;
|
||||
static pthread_t *threads;
|
||||
static pthread_mutex_t taskQueueMutex;
|
||||
static int nextTaskToRun;
|
||||
static sem_t *workerSemaphore;
|
||||
static uint32_t numUnfinishedTasks;
|
||||
static pthread_mutex_t tasksRunningConditionMutex;
|
||||
static pthread_cond_t tasksRunningCondition;
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
void ISPCLaunch(void *f, void *data);
|
||||
void ISPCSync();
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
||||
void ISPCFree(void *ptr);
|
||||
}
|
||||
|
||||
static void *lTaskEntry(void *arg);
|
||||
|
||||
/** Figure out how many CPU cores there are in the system
|
||||
*/
|
||||
/** Figure out how many CPU cores there are in the system.

    @return The number of online processors reported by sysconf(), or 1 if
            the query fails or reports a non-positive value, so callers
            always get a usable thread count.
 */
static int
lNumCPUCores() {
    long n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n < 1) ? 1 : (int)n;
}
|
||||
|
||||
|
||||
static void
|
||||
lTasksInit() {
|
||||
nThreads = lNumCPUCores();
|
||||
|
||||
threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t));
|
||||
|
||||
int err;
|
||||
if ((err = pthread_mutex_init(&taskQueueMutex, NULL)) != 0) {
|
||||
fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
char name[32];
|
||||
sprintf(name, "ispc_task.%d", (int)getpid());
|
||||
workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
|
||||
if (!workerSemaphore) {
|
||||
fprintf(stderr, "Error creating semaphore: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ((err = pthread_cond_init(&tasksRunningCondition, NULL)) != 0) {
|
||||
fprintf(stderr, "Error creating condition variable: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ((err = pthread_mutex_init(&tasksRunningConditionMutex, NULL)) != 0) {
|
||||
fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (int i = 0; i < nThreads; ++i) {
|
||||
err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)(i));
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ISPCLaunch(void *f, void *d) {
|
||||
int err;
|
||||
|
||||
if (!initialized) {
|
||||
while (1) {
|
||||
if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
|
||||
if (!initialized) {
|
||||
lTasksInit();
|
||||
__asm__ __volatile__("mfence":::"memory");
|
||||
initialized = 1;
|
||||
}
|
||||
lock = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Acquire mutex, add task
|
||||
//
|
||||
if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Need a mutex here to ensure we get this filled in before a worker
|
||||
// grabs it and starts running...
|
||||
TaskInfo *ti = lGetTaskInfo();
|
||||
ti->func = f;
|
||||
ti->data = d;
|
||||
|
||||
if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Update count of number of tasks left to run
|
||||
//
|
||||
if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// FIXME: is this redundant with nextTaskInfoCoordinate?
|
||||
++numUnfinishedTasks;
|
||||
|
||||
if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Post to the worker semaphore to wake up worker threads that are
|
||||
// sleeping waiting for tasks to show up
|
||||
//
|
||||
if ((err = sem_post(workerSemaphore)) != 0) {
|
||||
fprintf(stderr, "Error from sem_post: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void *
|
||||
lTaskEntry(void *arg) {
|
||||
int threadIndex = (int)((int64_t)arg);
|
||||
int threadCount = nThreads;
|
||||
TaskFuncType func;
|
||||
|
||||
while (1) {
|
||||
int err;
|
||||
if ((err = sem_wait(workerSemaphore)) != 0) {
|
||||
fprintf(stderr, "Error from sem_wait: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Acquire mutex, get task
|
||||
//
|
||||
if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (nextTaskToRun == nextTaskInfoCoordinate) {
|
||||
//
|
||||
// Task queue is empty, go back and wait on the semaphore
|
||||
//
|
||||
if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
int runCoord = nextTaskToRun++;
|
||||
int index = (runCoord >> LOG_TASK_QUEUE_CHUNK_SIZE);
|
||||
int offset = runCoord & (TASK_QUEUE_CHUNK_SIZE-1);
|
||||
TaskInfo *myTask = &taskInfo[index][offset];
|
||||
|
||||
if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Do work for _myTask_
|
||||
//
|
||||
func = (TaskFuncType)myTask->func;
|
||||
func(myTask->data, threadIndex, threadCount);
|
||||
|
||||
//
|
||||
// Decrement the number of unfinished tasks counter
|
||||
//
|
||||
if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// FIXME: can this be a comparison of (nextTaskToRun == nextTaskInfoCoordinate)?
|
||||
// (I don't think so--think there is a race...)
|
||||
int unfinished = --numUnfinishedTasks;
|
||||
if (unfinished == 0) {
|
||||
//
|
||||
// Signal the "no more tasks are running" condition if all of
|
||||
// them are done.
|
||||
//
|
||||
int err;
|
||||
if ((err = pthread_cond_signal(&tasksRunningCondition)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_cond_signal: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
pthread_exit(NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
int err;
|
||||
if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// As long as there are tasks running, wait on the condition variable;
|
||||
// doing so causes this thread to go to sleep until someone signals on
|
||||
// the tasksRunningCondition condition variable.
|
||||
while (numUnfinishedTasks > 0) {
|
||||
if ((err = pthread_cond_wait(&tasksRunningCondition,
|
||||
&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_cond_wait: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
lResetTaskInfo();
|
||||
nextTaskToRun = 0;
|
||||
|
||||
// We acquire ownership of the condition variable mutex when the above
|
||||
// pthread_cond_wait returns.
|
||||
// FIXME: is there a lurking issue here if numUnfinishedTasks gets back
|
||||
// to zero by the time we get to ISPCSync() and thence we're trying to
|
||||
// unlock a mutex we don't have a lock on?
|
||||
if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Allocates size bytes aligned to alignment (assumed to be a power of
    two).

    Windows uses _aligned_malloc() and Linux uses memalign().  Every other
    platform (including OS X) falls back to over-allocating with malloc():
    one pointer-sized header plus worst-case padding is requested, and the
    pointer returned by malloc() is stored in the slot just before the
    returned address for later release.

    @return The aligned block, or NULL if the underlying allocation fails.
 */
void *ISPCMalloc(int64_t size, int32_t alignment) {
#if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
#elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
#else
    const uintptr_t mask = (uintptr_t)alignment - 1;

    void *mem = malloc((size_t)size + mask + sizeof(void*));
    if (mem == NULL)
        return NULL;

    // Skip the header, then round *up* to a multiple of alignment.  Adding
    // "alignment - (addr & mask)" unconditionally would advance a whole
    // extra 'alignment' bytes when addr is already aligned and overrun the
    // allocation by one byte.
    uintptr_t addr = (uintptr_t)mem + sizeof(void*);
    addr = (addr + mask) & ~mask;

    char *amem = (char *)addr;
    ((void**)amem)[-1] = mem;
    return amem;
#endif
}
|
||||
|
||||
|
||||
/** Releases a block using the deallocator that matches the platform branch
    of ISPCMalloc() in this file: _aligned_free() on Windows, free() on
    Linux, and on OS X free() of the pointer stored just before ptr. */
void ISPCFree(void *ptr) {
#if defined(ISPC_IS_WINDOWS)
    _aligned_free(ptr);
#elif defined(ISPC_IS_LINUX)
    free(ptr);
#elif defined(ISPC_IS_APPLE)
    void **header = (void **)ptr;
    free(header[-1]);
#endif
}
|
||||
|
||||
865
examples/tasksys.cpp
Normal file
865
examples/tasksys.cpp
Normal file
@@ -0,0 +1,865 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
This file implements simple task systems that provide the three
|
||||
entrypoints used by ispc-generated code to handle 'launch' and 'sync'
|
||||
statements in ispc programs. See the section "Task Parallelism: Language
|
||||
Syntax" in the ispc documentation for information about using task
|
||||
parallelism in ispc programs, and see the section "Task Parallelism:
|
||||
Runtime Requirements" for information about the task-related entrypoints
|
||||
that are implemented here.
|
||||
|
||||
There are three task systems in this file: one built using Microsoft's
|
||||
Concurrency Runtime, one built with Apple's Grand Central Dispatch, and
|
||||
one built on top of bare pthreads.
|
||||
*/
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ISPC_IS_WINDOWS
|
||||
#define ISPC_USE_CONCRT
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#define ISPC_USE_PTHREADS
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#define ISPC_USE_GCD
|
||||
#endif
|
||||
|
||||
#define DBG(x)
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
#ifdef ISPC_USE_CONCRT
|
||||
#include <concrt.h>
|
||||
using namespace Concurrency;
|
||||
#endif // ISPC_USE_CONCRT
|
||||
#ifdef ISPC_USE_GCD
|
||||
#include <dispatch/dispatch.h>
|
||||
#include <pthread.h>
|
||||
#endif // ISPC_USE_GCD
|
||||
#ifdef ISPC_USE_PTHREADS
|
||||
#include <pthread.h>
|
||||
#include <semaphore.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/sysctl.h>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#endif // ISPC_USE_PTHREADS
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif // ISPC_IS_LINUX
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
|
||||
// Function-pointer type for the entry points that ispc emits for 'task'
// functions: an opaque argument block followed by the thread index/count
// and the task index/count.
typedef void (*TaskFuncType)(void *data,
                             int threadIndex, int threadCount,
                             int taskIndex, int taskCount);
|
||||
|
||||
// Small structure used to hold the data for each task
|
||||
struct TaskInfo {
|
||||
TaskFuncType func;
|
||||
void *data;
|
||||
int taskIndex, taskCount;
|
||||
#if defined(ISPC_IS_WINDOWS)
|
||||
event taskEvent;
|
||||
#endif
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// TaskGroupBase
|
||||
|
||||
// TaskInfo records are allocated in chunks of 2^LOG_TASK_QUEUE_CHUNK_SIZE
// entries; at most MAX_TASK_QUEUE_CHUNKS chunks exist per task group.
#define MAX_TASK_QUEUE_CHUNKS 8
#define LOG_TASK_QUEUE_CHUNK_SIZE 14
#define TASK_QUEUE_CHUNK_SIZE (1 << LOG_TASK_QUEUE_CHUNK_SIZE)

// Upper bound on the number of tasks a single task group can hold.
#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)

// Number of slots in the per-group table of ISPCAlloc() memory buffers.
#define NUM_MEM_BUFFERS 16
|
||||
|
||||
class TaskGroup;
|
||||
|
||||
/** The TaskGroupBase structure provides common functionality for "task
|
||||
groups"; a task group is the set of tasks launched from within a single
|
||||
ispc function. When the function is ready to return, it waits for all
|
||||
of the tasks in its task group to finish before it actually returns.
|
||||
*/
|
||||
class TaskGroupBase {
|
||||
public:
|
||||
void Reset();
|
||||
|
||||
int AllocTaskInfo(int count);
|
||||
TaskInfo *GetTaskInfo(int index);
|
||||
|
||||
void *AllocMemory(int64_t size, int32_t alignment);
|
||||
|
||||
protected:
|
||||
TaskGroupBase();
|
||||
~TaskGroupBase();
|
||||
|
||||
int nextTaskInfoIndex;
|
||||
|
||||
private:
|
||||
/* We allocate blocks of TASK_QUEUE_CHUNK_SIZE TaskInfo structures as
|
||||
needed by the calling function. We hold up to MAX_TASK_QUEUE_CHUNKS
|
||||
of these (and then exit at runtime if more than this many tasks are
|
||||
launched.)
|
||||
*/
|
||||
TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
|
||||
|
||||
/* We also allocate chunks of memory to service ISPCAlloc() calls. The
|
||||
memBuffers[] array holds pointers to this memory. The first element
|
||||
of this array is initialized to point to mem and then any subsequent
|
||||
elements required are initialized with dynamic allocation.
|
||||
*/
|
||||
int curMemBuffer, curMemBufferOffset;
|
||||
int memBufferSize[NUM_MEM_BUFFERS];
|
||||
char *memBuffers[NUM_MEM_BUFFERS];
|
||||
char mem[256];
|
||||
};
|
||||
|
||||
|
||||
// Starts with no tasks allocated, the allocation cursor at the beginning
// of the inline 'mem' array, and every other buffer / chunk slot empty.
inline TaskGroupBase::TaskGroupBase() {
    nextTaskInfoIndex = 0;
    curMemBufferOffset = 0;
    curMemBuffer = 0;

    // Task chunks are created on demand by GetTaskInfo().
    int chunk = 0;
    while (chunk < MAX_TASK_QUEUE_CHUNKS) {
        taskInfo[chunk] = NULL;
        ++chunk;
    }

    // Slot 0 is backed by the in-object array; the rest start out empty.
    memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
    memBuffers[0] = mem;
    int slot = 1;
    while (slot < NUM_MEM_BUFFERS) {
        memBufferSize[slot] = 0;
        memBuffers[slot] = NULL;
        ++slot;
    }
}
|
||||
|
||||
|
||||
// Releases everything the group allocated dynamically: the overflow
// memory buffers and the TaskInfo chunks created by GetTaskInfo().
inline TaskGroupBase::~TaskGroupBase() {
    // Note: don't delete memBuffers[0], since it points to the start of
    // the "mem" member!
    for (int i = 1; i < NUM_MEM_BUFFERS; ++i)
        delete[] memBuffers[i];

    // Chunks that were never needed are still NULL, which delete[] accepts.
    for (int i = 0; i < MAX_TASK_QUEUE_CHUNKS; ++i)
        delete[] taskInfo[i];
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroupBase::Reset() {
|
||||
nextTaskInfoIndex = 0;
|
||||
curMemBuffer = 0;
|
||||
curMemBufferOffset = 0;
|
||||
}
|
||||
|
||||
|
||||
inline int
|
||||
TaskGroupBase::AllocTaskInfo(int count) {
|
||||
int ret = nextTaskInfoIndex;
|
||||
nextTaskInfoIndex += count;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
// Returns the TaskInfo stored at slot 'index', allocating the chunk that
// contains it the first time the chunk is touched.  Prints a message and
// exits if the index is outside the range the fixed-size chunk table can
// represent.
inline TaskInfo *
TaskGroupBase::GetTaskInfo(int index) {
    int chunk = (index >> LOG_TASK_QUEUE_CHUNK_SIZE);
    int offset = index & (TASK_QUEUE_CHUNK_SIZE-1);

    // Reject every out-of-range chunk, not only the first one past the
    // end: a launch can jump the index well beyond it, and a negative
    // index would otherwise read before the start of taskInfo[].
    if (chunk < 0 || chunk >= MAX_TASK_QUEUE_CHUNKS) {
        fprintf(stderr, "A total of %d tasks have been launched from the "
                "current function--the simple built-in task system can handle "
                "no more. You can increase the values of MAX_TASK_QUEUE_CHUNKS "
                "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation. "
                "Sorry!  Exiting.\n", index);
        exit(1);
    }

    if (taskInfo[chunk] == NULL)
        taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
    return &taskInfo[chunk][offset];
}
|
||||
|
||||
|
||||
inline void *
|
||||
TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
|
||||
char *basePtr = memBuffers[curMemBuffer];
|
||||
int64_t iptr = (int64_t)(basePtr + curMemBufferOffset);
|
||||
iptr = (iptr + (alignment-1)) & ~(alignment-1);
|
||||
|
||||
int newOffset = int(iptr + size - (int64_t)basePtr);
|
||||
if (newOffset < memBufferSize[curMemBuffer]) {
|
||||
curMemBufferOffset = newOffset;
|
||||
return (char *)iptr;
|
||||
}
|
||||
|
||||
++curMemBuffer;
|
||||
curMemBufferOffset = 0;
|
||||
assert(curMemBuffer < NUM_MEM_BUFFERS);
|
||||
|
||||
int allocSize = 1 << (12 + curMemBuffer);
|
||||
allocSize = std::max(int(size+alignment), allocSize);
|
||||
char *newBuf = new char[allocSize];
|
||||
memBufferSize[curMemBuffer] = allocSize;
|
||||
memBuffers[curMemBuffer] = newBuf;
|
||||
return AllocMemory(size, alignment);
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Atomics and the like
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
// Issues an x86 'mfence' instruction; the "memory" clobber also stops the
// compiler from moving memory accesses across the call.
static inline void
lMemFence() {
    __asm__ __volatile__("mfence" : : : "memory");
}
#endif // !ISPC_IS_WINDOWS
|
||||
|
||||
|
||||
// ISPC_POINTER_BYTES: size of a pointer in bytes (4 or 8).
// Trust the compiler-provided __SIZEOF_POINTER__ when it exists.  In the
// fallback, test the 64-bit macros first: _WIN32 is defined for 64-bit
// Windows builds as well, so it cannot be used to detect 32-bit pointers
// before _WIN64 has been ruled out.
#if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
#define ISPC_POINTER_BYTES 4
#elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
#define ISPC_POINTER_BYTES 8
#elif defined(_WIN64) || defined(__x86_64__) || defined(__amd64__)
#define ISPC_POINTER_BYTES 8
#elif defined(_WIN32) || defined(__i386__)
#define ISPC_POINTER_BYTES 4
#else
#error "Pointer size unknown!"
#endif // __SIZEOF_POINTER__
|
||||
|
||||
|
||||
// Atomic compare-and-swap on a pointer: if *v equals oldValue, stores
// newValue into *v.  Returns the value *v held before the operation, so
// the swap took place exactly when the return value equals oldValue.
//
// The non-Windows path uses the compiler's __sync builtin, which picks
// the correctly sized instruction for the target and is a full memory
// barrier, instead of hand-written per-pointer-size inline assembly.
static void *
lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
#ifdef ISPC_IS_WINDOWS
    return InterlockedCompareExchangePointer(v, newValue, oldValue);
#else
    // Note the builtin's argument order: (ptr, expected, desired).
    return __sync_val_compare_and_swap(v, oldValue, newValue);
#endif // ISPC_IS_WINDOWS
}
|
||||
|
||||
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
// Atomic 32-bit compare-and-swap: if *v equals oldValue, stores newValue
// into *v.  Returns the value *v held before the operation (equal to
// oldValue exactly when the swap happened).  Implemented with the
// compiler's __sync builtin, which acts as a full memory barrier.
static int32_t
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
    // Note the builtin's argument order: (ptr, expected, desired).
    return __sync_val_compare_and_swap(v, oldValue, newValue);
}
#endif // !ISPC_IS_WINDOWS
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef ISPC_USE_CONCRT
// The ConcRT task group adds no state of its own to TaskGroupBase; it
// only supplies the Launch() and Sync() operations.
class TaskGroup : public TaskGroupBase {
public:
    void Sync();
    void Launch(int baseIndex, int count);
};
#endif // ISPC_USE_CONCRT
|
||||
|
||||
#ifdef ISPC_USE_GCD
/* With Grand Central Dispatch, we associate a GCD dispatch group with each
   task group.  (We'll later wait on this dispatch group when we need to
   wait on all of the tasks in the group to finish.)
 */
class TaskGroup : public TaskGroupBase {
public:
    TaskGroup() {
        gcdGroup = dispatch_group_create();
    }

    // Give back the reference obtained from dispatch_group_create();
    // task groups are deleted when the free list is full.
    ~TaskGroup() {
        if (gcdGroup != NULL)
            dispatch_release(gcdGroup);
    }

    void Launch(int baseIndex, int count);
    void Sync();

private:
    dispatch_group_t gcdGroup;
};
#endif // ISPC_USE_GCD
|
||||
|
||||
#ifdef ISPC_USE_PTHREADS
static void *lTaskEntry(void *arg);

// pthreads task group: on top of TaskGroupBase it tracks how many of its
// tasks have not finished yet, which task indices are still waiting to be
// started, and whether the group is currently on the global active list.
class TaskGroup : public TaskGroupBase {
public:
    TaskGroup()
        : numUnfinishedTasks(0), inActiveList(false) {
        waitingTasks.reserve(128);
    }

    // Resets the base-class state and the unfinished-task counter; the
    // group must not be on the active list when this is called.
    void Reset() {
        TaskGroupBase::Reset();
        numUnfinishedTasks = 0;
        assert(inActiveList == false);
        lMemFence();
    }

    void Launch(int baseIndex, int count);
    void Sync();

private:
    // The worker thread entry point manipulates the members below.
    friend void *lTaskEntry(void *arg);

    int32_t numUnfinishedTasks;
    int32_t pad[3];
    std::vector<int> waitingTasks;
    bool inActiveList;
};

#endif // ISPC_USE_PTHREADS
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Grand Central Dispatch
|
||||
|
||||
#ifdef ISPC_USE_GCD
|
||||
|
||||
/* A simple task system for ispc programs based on Apple's Grand Central
|
||||
Dispatch. */
|
||||
|
||||
static dispatch_queue_t gcdQueue;
|
||||
static volatile int32_t lock = 0;
|
||||
|
||||
static void
|
||||
InitTaskSystem() {
|
||||
if (gcdQueue != NULL)
|
||||
return;
|
||||
|
||||
while (1) {
|
||||
if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
|
||||
if (gcdQueue == NULL) {
|
||||
gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
|
||||
assert(gcdQueue != NULL);
|
||||
lMemFence();
|
||||
}
|
||||
lock = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
lRunTask(void *ti) {
|
||||
TaskInfo *taskInfo = (TaskInfo *)ti;
|
||||
// FIXME: these are bogus values; may cause bugs in code that depends
|
||||
// on them having unique values in different threads.
|
||||
int threadIndex = 0;
|
||||
int threadCount = 1;
|
||||
|
||||
// Actually run the task
|
||||
taskInfo->func(taskInfo->data, threadIndex, threadCount,
|
||||
taskInfo->taskIndex, taskInfo->taskCount);
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroup::Launch(int baseIndex, int count) {
|
||||
for (int i = 0; i < count; ++i) {
|
||||
TaskInfo *ti = GetTaskInfo(baseIndex + i);
|
||||
dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroup::Sync() {
|
||||
dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
|
||||
}
|
||||
|
||||
#endif // ISPC_USE_GCD
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Concurrency Runtime
|
||||
|
||||
#ifdef ISPC_USE_CONCRT
|
||||
|
||||
// The Concurrency Runtime needs no explicit setup; this exists so that
// all task systems expose the same entry point.
static void
InitTaskSystem() {
}
|
||||
|
||||
|
||||
static void __cdecl
|
||||
lRunTask(LPVOID param) {
|
||||
TaskInfo *ti = (TaskInfo *)param;
|
||||
|
||||
// Actually run the task.
|
||||
// FIXME: like the GCD implementation for OS X, this is passing bogus
|
||||
// values for the threadIndex and threadCount builtins, which in turn
|
||||
// will cause bugs in code that uses those.
|
||||
int threadIndex = 0;
|
||||
int threadCount = 1;
|
||||
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
|
||||
|
||||
// Signal the event that this task is done
|
||||
ti->taskEvent.set();
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroup::Launch(int baseIndex, int count) {
|
||||
for (int i = 0; i < count; ++i)
|
||||
CurrentScheduler::ScheduleTask(lRunTask, GetTaskInfo(baseIndex + i));
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroup::Sync() {
|
||||
for (int i = 0; i < nextTaskInfoIndex; ++i) {
|
||||
TaskInfo *ti = GetTaskInfo(i);
|
||||
ti->taskEvent.wait();
|
||||
ti->taskEvent.reset();
|
||||
}
|
||||
}
|
||||
|
||||
#endif // ISPC_USE_CONCRT
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// pthreads
|
||||
|
||||
#ifdef ISPC_USE_PTHREADS
|
||||
|
||||
static volatile int32_t lock = 0;
|
||||
|
||||
static int nThreads;
|
||||
static pthread_t *threads = NULL;
|
||||
|
||||
static pthread_mutex_t taskSysMutex;
|
||||
static std::vector<TaskGroup *> activeTaskGroups;
|
||||
static sem_t *workerSemaphore;
|
||||
|
||||
|
||||
// Atomically adds 'delta' to *v and returns the value *v held before the
// addition.  Uses the compiler's __sync builtin (a full memory barrier)
// rather than hand-written x86 assembly.
static inline int32_t
lAtomicAdd(int32_t *v, int32_t delta) {
    return __sync_fetch_and_add(v, delta);
}
|
||||
|
||||
|
||||
static void *
|
||||
lTaskEntry(void *arg) {
|
||||
int threadIndex = (int)((int64_t)arg);
|
||||
int threadCount = nThreads;
|
||||
|
||||
while (1) {
|
||||
int err;
|
||||
//
|
||||
// Wait on the semaphore until we're woken up due to the arrival of
|
||||
// more work.
|
||||
//
|
||||
if ((err = sem_wait(workerSemaphore)) != 0) {
|
||||
fprintf(stderr, "Error from sem_wait: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Acquire the mutex
|
||||
//
|
||||
if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (activeTaskGroups.size() == 0) {
|
||||
//
|
||||
// Task queue is empty, go back and wait on the semaphore
|
||||
//
|
||||
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
//
|
||||
// Get the last task group on the active list and the last task
|
||||
// from its waiting tasks list.
|
||||
//
|
||||
TaskGroup *tg = activeTaskGroups.back();
|
||||
assert(tg->waitingTasks.size() > 0);
|
||||
int taskNumber = tg->waitingTasks.back();
|
||||
tg->waitingTasks.pop_back();
|
||||
|
||||
if (tg->waitingTasks.size() == 0) {
|
||||
// We just took the last task from this task group, so remove
|
||||
// it from the active list.
|
||||
activeTaskGroups.pop_back();
|
||||
tg->inActiveList = false;
|
||||
}
|
||||
|
||||
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// And now actually run the task
|
||||
//
|
||||
DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg));
|
||||
TaskInfo *myTask = tg->GetTaskInfo(taskNumber);
|
||||
myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex,
|
||||
myTask->taskCount);
|
||||
|
||||
//
|
||||
// Decrement the "number of unfinished tasks" counter in the task
|
||||
// group.
|
||||
//
|
||||
lMemFence();
|
||||
lAtomicAdd(&tg->numUnfinishedTasks, -1);
|
||||
}
|
||||
|
||||
pthread_exit(NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
InitTaskSystem() {
|
||||
if (threads == NULL) {
|
||||
while (1) {
|
||||
if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
|
||||
if (threads == NULL) {
|
||||
// We launch one fewer thread than there are cores,
|
||||
// since the main thread here will also grab jobs from
|
||||
// the task queue itself.
|
||||
nThreads = sysconf(_SC_NPROCESSORS_ONLN) - 1;
|
||||
|
||||
int err;
|
||||
if ((err = pthread_mutex_init(&taskSysMutex, NULL)) != 0) {
|
||||
fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
char name[32];
|
||||
sprintf(name, "ispc_task.%d", (int)getpid());
|
||||
workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
|
||||
if (!workerSemaphore) {
|
||||
fprintf(stderr, "Error creating semaphore: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t));
|
||||
for (int i = 0; i < nThreads; ++i) {
|
||||
err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)(i));
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
activeTaskGroups.reserve(64);
|
||||
}
|
||||
|
||||
// Make sure all of the above goes to memory before we
|
||||
// clear the lock.
|
||||
lMemFence();
|
||||
lock = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroup::Launch(int baseCoord, int count) {
|
||||
//
|
||||
// Acquire mutex, add task
|
||||
//
|
||||
int err;
|
||||
if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Add the corresponding set of tasks to the waiting-to-be-run list for
|
||||
// this task group.
|
||||
//
|
||||
// FIXME: it's a little ugly to hold a global mutex for this when we
|
||||
// only need to make sure no one else is accessing this task group's
|
||||
// waitingTasks list. (But a small experiment in switching to a
|
||||
// per-TaskGroup mutex showed worse performance!)
|
||||
for (int i = 0; i < count; ++i)
|
||||
waitingTasks.push_back(baseCoord + i);
|
||||
|
||||
// Add the task group to the global active list if it isn't there
|
||||
// already.
|
||||
if (inActiveList == false) {
|
||||
activeTaskGroups.push_back(this);
|
||||
inActiveList = true;
|
||||
}
|
||||
|
||||
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Update the count of the number of tasks left to run in this task
|
||||
// group.
|
||||
//
|
||||
lMemFence();
|
||||
lAtomicAdd(&numUnfinishedTasks, count);
|
||||
|
||||
//
|
||||
// Post to the worker semaphore to wake up worker threads that are
|
||||
// sleeping waiting for tasks to show up
|
||||
//
|
||||
for (int i = 0; i < count; ++i)
|
||||
if ((err = sem_post(workerSemaphore)) != 0) {
|
||||
fprintf(stderr, "Error from sem_post: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroup::Sync() {
|
||||
DBG(fprintf(stderr, "syncing %p - %d unfinished\n", tg, numUnfinishedTasks));
|
||||
|
||||
while (numUnfinishedTasks > 0) {
|
||||
// All of the tasks in this group aren't finished yet. We'll try
|
||||
// to help out here since we don't have anything else to do...
|
||||
|
||||
DBG(fprintf(stderr, "while syncing %p - %d unfinished\n", tg,
|
||||
numUnfinishedTasks));
|
||||
|
||||
//
|
||||
// Acquire the global task system mutex to grab a task to work on
|
||||
//
|
||||
int err;
|
||||
if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
TaskInfo *myTask = NULL;
|
||||
TaskGroup *runtg = this;
|
||||
if (waitingTasks.size() > 0) {
|
||||
int taskNumber = waitingTasks.back();
|
||||
waitingTasks.pop_back();
|
||||
|
||||
if (waitingTasks.size() == 0) {
|
||||
// There's nothing left to start running from this group,
|
||||
// so remove it from the active task list.
|
||||
activeTaskGroups.erase(std::find(activeTaskGroups.begin(),
|
||||
activeTaskGroups.end(), this));
|
||||
inActiveList = false;
|
||||
}
|
||||
myTask = GetTaskInfo(taskNumber);
|
||||
DBG(fprintf(stderr, "running task %d from group %p in sync\n", taskNumber, tg));
|
||||
}
|
||||
else {
|
||||
// Other threads are already working on all of the tasks in
|
||||
// this group, so we can't help out by running one ourself.
|
||||
// We'll try to run one from another group to make ourselves
|
||||
// useful here.
|
||||
if (activeTaskGroups.size() == 0) {
|
||||
// No active task groups left--there's nothing for us to do.
|
||||
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
// FIXME: We basically end up busy-waiting here, which is
|
||||
// extra wasteful in a world with hyperthreading. It would
|
||||
// be much better to put this thread to sleep on a
|
||||
// condition variable that was signaled when the last task
|
||||
// in this group was finished.
|
||||
sleep(0);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Get a task to run from another task group.
|
||||
runtg = activeTaskGroups.back();
|
||||
assert(runtg->waitingTasks.size() > 0);
|
||||
|
||||
int taskNumber = runtg->waitingTasks.back();
|
||||
runtg->waitingTasks.pop_back();
|
||||
if (runtg->waitingTasks.size() == 0) {
|
||||
// There's left to start running from this group, so remove
|
||||
// it from the active task list.
|
||||
activeTaskGroups.pop_back();
|
||||
runtg->inActiveList = false;
|
||||
}
|
||||
myTask = runtg->GetTaskInfo(taskNumber);
|
||||
DBG(fprintf(stderr, "running task %d from other group %p in sync\n",
|
||||
taskNumber, runtg));
|
||||
}
|
||||
|
||||
if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Do work for _myTask_
|
||||
//
|
||||
// FIXME: bogus values for thread index/thread count here as well..
|
||||
myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount);
|
||||
|
||||
//
|
||||
// Decrement the number of unfinished tasks counter
|
||||
//
|
||||
lMemFence();
|
||||
lAtomicAdd(&runtg->numUnfinishedTasks, -1);
|
||||
}
|
||||
DBG(fprintf(stderr, "sync for %p done!n", tg));
|
||||
}
|
||||
|
||||
#endif // ISPC_USE_PTHREADS
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define MAX_FREE_TASK_GROUPS 64
|
||||
static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
|
||||
|
||||
// Returns a task group for the caller's exclusive use: one claimed from
// the freeTaskGroups[] cache when possible, otherwise a newly allocated
// one.  A cached group is claimed by atomically swapping its slot from
// the observed pointer to NULL.
static inline TaskGroup *
AllocTaskGroup() {
    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
        TaskGroup *tg = freeTaskGroups[i];
        if (tg != NULL) {
            void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);
            // The swap succeeded only if the slot still held 'tg'.  Any
            // other result (NULL, or a different group that another
            // thread put there in the meantime) means we claimed nothing
            // and must not hand out what is still on the free list.
            if (ptr == tg)
                return tg;
        }
    }

    return new TaskGroup;
}
|
||||
|
||||
|
||||
static inline void
|
||||
FreeTaskGroup(TaskGroup *tg) {
|
||||
tg->Reset();
|
||||
|
||||
for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
|
||||
if (freeTaskGroups[i] == NULL) {
|
||||
void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL);
|
||||
if (ptr == NULL)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
delete tg;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// The three entry points that ispc-generated code calls.  They are
// declared with C linkage so that their names are not mangled.
extern "C" {
    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
    void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
    void ISPCSync(void *handle);
}
|
||||
|
||||
void
|
||||
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
|
||||
TaskGroup *taskGroup;
|
||||
if (*taskGroupPtr == NULL) {
|
||||
InitTaskSystem();
|
||||
taskGroup = AllocTaskGroup();
|
||||
*taskGroupPtr = taskGroup;
|
||||
}
|
||||
else
|
||||
taskGroup = (TaskGroup *)(*taskGroupPtr);
|
||||
|
||||
int baseIndex = taskGroup->AllocTaskInfo(count);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i);
|
||||
ti->func = (TaskFuncType)func;
|
||||
ti->data = data;
|
||||
ti->taskIndex = i;
|
||||
ti->taskCount = count;
|
||||
}
|
||||
taskGroup->Launch(baseIndex, count);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ISPCSync(void *h) {
|
||||
TaskGroup *taskGroup = (TaskGroup *)h;
|
||||
if (taskGroup != NULL) {
|
||||
taskGroup->Sync();
|
||||
FreeTaskGroup(taskGroup);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void *
|
||||
ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) {
|
||||
TaskGroup *taskGroup;
|
||||
if (*taskGroupPtr == NULL) {
|
||||
InitTaskSystem();
|
||||
taskGroup = AllocTaskGroup();
|
||||
*taskGroupPtr = taskGroup;
|
||||
}
|
||||
else
|
||||
taskGroup = (TaskGroup *)(*taskGroupPtr);
|
||||
|
||||
return taskGroup->AllocMemory(size, alignment);
|
||||
}
|
||||
@@ -1,20 +1,17 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasks_pthreads.cpp
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
|
||||
ifeq ($(ARCH), Darwin)
|
||||
TASK_CXX=../tasks_gcd.cpp
|
||||
TASK_LIB=
|
||||
endif
|
||||
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64
|
||||
|
||||
OBJS=objs/volume.o objs/volume_serial.o $(TASK_OBJ) objs/volume_ispc.o \
|
||||
objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o
|
||||
|
||||
default: volume
|
||||
|
||||
@@ -26,8 +23,8 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ volume
|
||||
|
||||
volume: dirs objs/volume.o objs/volume_serial.o objs/volume_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/volume.o objs/volume_ispc.o objs/volume_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
volume: dirs $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
@@ -37,5 +34,5 @@ objs/%.o: ../%.cpp
|
||||
|
||||
objs/volume.o: objs/volume_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -41,7 +41,6 @@
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "volume_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
@@ -70,37 +69,6 @@ writePPM(float *buf, int width, int height, const char *fn) {
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Load image and viewing parameters from a camera data file.
|
||||
FIXME: we should add support to be able to specify viewing parameters
|
||||
in the program here directly. */
|
||||
@@ -172,8 +140,6 @@ int main(int argc, char *argv[]) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
//
|
||||
// Load viewing data and the volume density data
|
||||
//
|
||||
|
||||
@@ -343,11 +343,20 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
||||
|
||||
|
||||
task void
|
||||
volume_task(uniform int x0, uniform int y0, uniform int x1,
|
||||
uniform int y1, uniform float density[], uniform int nVoxels[3],
|
||||
volume_task(uniform float density[], uniform int nVoxels[3],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[]) {
|
||||
uniform int dx = 8, dy = 8; // must match value in volume_ispc_tasks
|
||||
uniform int xbuckets = (width + (dx-1)) / dx;
|
||||
uniform int ybuckets = (height + (dy-1)) / dy;
|
||||
|
||||
uniform int x0 = (taskIndex % xbuckets) * dx;
|
||||
uniform int y0 = (taskIndex / xbuckets) * dy;
|
||||
uniform int x1 = x0 + dx, y1 = y0 + dy;
|
||||
x1 = min(x1, width);
|
||||
y1 = min(y1, height);
|
||||
|
||||
volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
|
||||
camera2world, width, height, image);
|
||||
}
|
||||
@@ -370,9 +379,7 @@ volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
|
||||
uniform int width, uniform int height, uniform float image[]) {
|
||||
// Launch tasks to work on (dx,dy)-sized tiles of the image
|
||||
uniform int dx = 8, dy = 8;
|
||||
for (uniform int y = 0; y < height; y += dy)
|
||||
for (uniform int x = 0; x < width; x += dx)
|
||||
launch < volume_task(x, y, x+dx, y+dy, density, nVoxels,
|
||||
raster2camera, camera2world, width, height,
|
||||
image) >;
|
||||
uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
|
||||
launch[nTasks] < volume_task(density, nVoxels, raster2camera, camera2world,
|
||||
width, height, image) >;
|
||||
}
|
||||
|
||||
28
examples/volume_rendering/volume.vcxproj
Executable file → Normal file
28
examples/volume_rendering/volume.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -64,15 +64,19 @@
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
@@ -81,6 +85,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -96,6 +101,7 @@
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
@@ -113,6 +119,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -131,6 +138,7 @@
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -143,23 +151,23 @@
|
||||
<ItemGroup>
|
||||
<ClCompile Include="volume.cpp" />
|
||||
<ClCompile Include="volume_serial.cpp" />
|
||||
<ClCompile Include="../tasks_concrt.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="volume.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
@@ -36,9 +36,6 @@
|
||||
#include <algorithm>
|
||||
|
||||
// Just enough of a float3 class to do what we need in this file.
|
||||
#ifdef _MSC_VER
|
||||
__declspec(align(16))
|
||||
#endif
|
||||
struct float3 {
|
||||
float3() { }
|
||||
float3(float xx, float yy, float zz) { x = xx; y = yy; z = zz; }
|
||||
@@ -298,7 +295,7 @@ volume_serial(float density[], int nVoxels[3], const float raster2camera[4][4],
|
||||
for (int y = 0; y < height; ++y) {
|
||||
for (int x = 0; x < width; ++x, ++offset) {
|
||||
Ray ray;
|
||||
generateRay(raster2camera, camera2world, x, y, ray);
|
||||
generateRay(raster2camera, camera2world, (float)x, (float)y, ray);
|
||||
image[offset] = raymarch(density, nVoxels, ray);
|
||||
}
|
||||
}
|
||||
|
||||
118
expr.cpp
118
expr.cpp
@@ -1189,10 +1189,10 @@ BinaryExpr::Optimize() {
|
||||
m->symbolTable->LookupFunction("rcp");
|
||||
if (rcpFuns != NULL) {
|
||||
assert(rcpFuns->size() == 2);
|
||||
Expr *rcpSymExpr = new FunctionSymbolExpr(rcpFuns, pos);
|
||||
Expr *rcpSymExpr = new FunctionSymbolExpr("rcp", rcpFuns, pos);
|
||||
ExprList *args = new ExprList(arg1, arg1->pos);
|
||||
Expr *rcpCall = new FunctionCallExpr(rcpSymExpr, args,
|
||||
arg1->pos, false);
|
||||
arg1->pos);
|
||||
rcpCall = rcpCall->TypeCheck();
|
||||
if (rcpCall == NULL)
|
||||
return NULL;
|
||||
@@ -1305,6 +1305,17 @@ BinaryExpr::TypeCheck() {
|
||||
if (type0 == NULL || type1 == NULL)
|
||||
return NULL;
|
||||
|
||||
if (dynamic_cast<const ReferenceType *>(type0) != NULL) {
|
||||
arg0 = new DereferenceExpr(arg0, arg0->pos);
|
||||
type0 = arg0->GetType();
|
||||
assert(type0 != NULL);
|
||||
}
|
||||
if (dynamic_cast<const ReferenceType *>(type1) != NULL) {
|
||||
arg1 = new DereferenceExpr(arg1, arg1->pos);
|
||||
type1 = arg1->GetType();
|
||||
assert(type1 != NULL);
|
||||
}
|
||||
|
||||
switch (op) {
|
||||
case Shl:
|
||||
case Shr:
|
||||
@@ -1486,7 +1497,7 @@ lStoreAssignResult(llvm::Value *rv, llvm::Value *lv, const Type *type,
|
||||
assert(baseSym->varyingCFDepth <= ctx->VaryingCFDepth());
|
||||
if (!g->opt.disableMaskedStoreToStore &&
|
||||
baseSym->varyingCFDepth == ctx->VaryingCFDepth() &&
|
||||
baseSym->isStatic == false &&
|
||||
baseSym->storageClass != SC_STATIC &&
|
||||
dynamic_cast<const ReferenceType *>(baseSym->type) == NULL) {
|
||||
// If the variable is declared at the same varying control flow
|
||||
// depth as where it's being assigned, then we don't need to do any
|
||||
@@ -2202,7 +2213,7 @@ FunctionCallExpr::tryResolve(bool (*matchFunc)(Expr *, const Type *)) {
|
||||
|
||||
|
||||
void
|
||||
FunctionCallExpr::resolveFunctionOverloads() {
|
||||
FunctionCallExpr::resolveFunctionOverloads(bool exactMatchOnly) {
|
||||
FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
|
||||
if (!fse)
|
||||
// error will be issued later if not calling an actual function
|
||||
@@ -2216,44 +2227,55 @@ FunctionCallExpr::resolveFunctionOverloads() {
|
||||
if (tryResolve(lExactMatch))
|
||||
return;
|
||||
|
||||
// Try to find a single match ignoring references
|
||||
if (tryResolve(lMatchIgnoringReferences))
|
||||
return;
|
||||
if (!exactMatchOnly) {
|
||||
// Try to find a single match ignoring references
|
||||
if (tryResolve(lMatchIgnoringReferences))
|
||||
return;
|
||||
|
||||
// TODO: next, try to find an exact match via type promotion--i.e. char
|
||||
// -> int, etc--things that don't lose data
|
||||
// TODO: next, try to find an exact match via type promotion--i.e. char
|
||||
// -> int, etc--things that don't lose data
|
||||
|
||||
// Next try to see if there's a match via just uniform -> varying
|
||||
// promotions. TODO: look for one with a minimal number of them?
|
||||
if (tryResolve(lMatchIgnoringUniform))
|
||||
return;
|
||||
// Next try to see if there's a match via just uniform -> varying
|
||||
// promotions. TODO: look for one with a minimal number of them?
|
||||
if (tryResolve(lMatchIgnoringUniform))
|
||||
return;
|
||||
|
||||
// Try to find a match via type conversion, but don't change
|
||||
// unif->varying
|
||||
if (tryResolve(lMatchWithTypeConvSameVariability))
|
||||
return;
|
||||
// Try to find a match via type conversion, but don't change
|
||||
// unif->varying
|
||||
if (tryResolve(lMatchWithTypeConvSameVariability))
|
||||
return;
|
||||
|
||||
// Last chance: try to find a match via arbitrary type conversion.
|
||||
if (tryResolve(lMatchWithTypeConv))
|
||||
return;
|
||||
// Last chance: try to find a match via arbitrary type conversion.
|
||||
if (tryResolve(lMatchWithTypeConv))
|
||||
return;
|
||||
}
|
||||
|
||||
// failure :-(
|
||||
const char *funName = fse->candidateFunctions->front()->name.c_str();
|
||||
Error(pos, "Unable to find matching overload for call to function \"%s\".",
|
||||
funName);
|
||||
Error(pos, "Unable to find matching overload for call to function \"%s\"%s.",
|
||||
funName, exactMatchOnly ? " only considering exact matches" : "");
|
||||
fprintf(stderr, "Candidates are:\n");
|
||||
lPrintFunctionOverloads(*fse->candidateFunctions);
|
||||
lPrintPassedTypes(funName, args->exprs);
|
||||
}
|
||||
|
||||
|
||||
FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, bool il)
|
||||
: Expr(p) {
|
||||
FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p,
|
||||
bool il, Expr *lce)
|
||||
: Expr(p), isLaunch(il) {
|
||||
func = f;
|
||||
args = a;
|
||||
isLaunch = il;
|
||||
launchCountExpr = lce;
|
||||
|
||||
resolveFunctionOverloads();
|
||||
FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
|
||||
// Functions with names that start with "__" should only be various
|
||||
// builtins. For those, we'll demand an exact match, since we'll
|
||||
// expect whichever function in stdlib.ispc is calling out to one of
|
||||
// those to be matching the argument types exactly; this is to be a bit
|
||||
// extra safe to be sure that the expected builtin is in fact being
|
||||
// called.
|
||||
bool exactMatchOnly = (fse != NULL) && (fse->name.substr(0,2) == "__");
|
||||
resolveFunctionOverloads(exactMatchOnly);
|
||||
}
|
||||
|
||||
|
||||
@@ -2379,8 +2401,12 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
|
||||
|
||||
llvm::Value *retVal = NULL;
|
||||
ctx->SetDebugPos(pos);
|
||||
if (ft->isTask)
|
||||
ctx->LaunchInst(callee, argVals);
|
||||
if (ft->isTask) {
|
||||
assert(launchCountExpr != NULL);
|
||||
llvm::Value *launchCount = launchCountExpr->GetValue(ctx);
|
||||
if (launchCount != NULL)
|
||||
ctx->LaunchInst(callee, argVals, launchCount);
|
||||
}
|
||||
else {
|
||||
// Most of the time, the mask is passed as the last argument. this
|
||||
// isn't the case for things like intrinsics, builtins, and extern
|
||||
@@ -2456,10 +2482,21 @@ FunctionCallExpr::TypeCheck() {
|
||||
if (!isLaunch)
|
||||
Error(pos, "\"launch\" expression needed to call function "
|
||||
"with \"task\" qualifier.");
|
||||
if (!launchCountExpr)
|
||||
return NULL;
|
||||
|
||||
launchCountExpr =
|
||||
launchCountExpr->TypeConv(AtomicType::UniformInt32,
|
||||
"task launch count");
|
||||
if (!launchCountExpr)
|
||||
return NULL;
|
||||
}
|
||||
else {
|
||||
if (isLaunch)
|
||||
Error(pos, "\"launch\" expression illegal with non-\"task\"-"
|
||||
"qualified function.");
|
||||
assert(launchCountExpr == NULL);
|
||||
}
|
||||
else if (isLaunch)
|
||||
Error(pos, "\"launch\" expression illegal with non-\"task\"-"
|
||||
"qualified function.");
|
||||
}
|
||||
else
|
||||
Error(pos, "Valid function name must be used for function call.");
|
||||
@@ -4103,7 +4140,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
|
||||
case AtomicType::TYPE_BOOL:
|
||||
if (fromType->IsVaryingType() &&
|
||||
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
|
||||
// If we have a bool vector of i32 element,s first truncate
|
||||
// If we have a bool vector of i32 elements, first truncate
|
||||
// down to a single bit
|
||||
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
|
||||
// And then do an unsigned int->float cast
|
||||
@@ -4163,9 +4200,6 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
|
||||
case AtomicType::TYPE_UINT16:
|
||||
case AtomicType::TYPE_UINT32:
|
||||
case AtomicType::TYPE_UINT64:
|
||||
if (fromType->IsVaryingType())
|
||||
PerformanceWarning(pos, "Conversion from unsigned int64 to float is slow. "
|
||||
"Use \"int64\" if possible");
|
||||
cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
|
||||
exprVal, targetType, "uint2double");
|
||||
break;
|
||||
@@ -5193,9 +5227,11 @@ SymbolExpr::Print() const {
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// FunctionSymbolExpr
|
||||
|
||||
FunctionSymbolExpr::FunctionSymbolExpr(std::vector<Symbol *> *candidates,
|
||||
FunctionSymbolExpr::FunctionSymbolExpr(const char *n,
|
||||
std::vector<Symbol *> *candidates,
|
||||
SourcePos p)
|
||||
: Expr(p) {
|
||||
name = n;
|
||||
matchingFunc = NULL;
|
||||
candidateFunctions = candidates;
|
||||
}
|
||||
@@ -5261,14 +5297,8 @@ SyncExpr::GetType() const {
|
||||
llvm::Value *
|
||||
SyncExpr::GetValue(FunctionEmitContext *ctx) const {
|
||||
ctx->SetDebugPos(pos);
|
||||
std::vector<llvm::Value *> noArg;
|
||||
llvm::Function *fsync = m->module->getFunction("ISPCSync");
|
||||
if (fsync == NULL) {
|
||||
FATAL("Couldn't find ISPCSync declaration?!");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ctx->CallInst(fsync, noArg, "");
|
||||
ctx->SyncInst();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
|
||||
12
expr.h
12
expr.h
@@ -39,6 +39,7 @@
|
||||
#define ISPC_EXPR_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include "ast.h"
|
||||
#include "type.h"
|
||||
|
||||
class FunctionSymbolExpr;
|
||||
@@ -250,7 +251,8 @@ public:
|
||||
*/
|
||||
class FunctionCallExpr : public Expr {
|
||||
public:
|
||||
FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, bool isLaunch);
|
||||
FunctionCallExpr(Expr *func, ExprList *args, SourcePos p,
|
||||
bool isLaunch = false, Expr *launchCountExpr = NULL);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
@@ -263,9 +265,10 @@ public:
|
||||
Expr *func;
|
||||
ExprList *args;
|
||||
bool isLaunch;
|
||||
Expr *launchCountExpr;
|
||||
|
||||
private:
|
||||
void resolveFunctionOverloads();
|
||||
void resolveFunctionOverloads(bool exactMatchOnly);
|
||||
bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
|
||||
};
|
||||
|
||||
@@ -567,7 +570,7 @@ private:
|
||||
*/
|
||||
class FunctionSymbolExpr : public Expr {
|
||||
public:
|
||||
FunctionSymbolExpr(std::vector<Symbol *> *candidateFunctions,
|
||||
FunctionSymbolExpr(const char *name, std::vector<Symbol *> *candidateFunctions,
|
||||
SourcePos pos);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
@@ -581,6 +584,9 @@ public:
|
||||
private:
|
||||
friend class FunctionCallExpr;
|
||||
|
||||
/** Name of the function that is being called. */
|
||||
std::string name;
|
||||
|
||||
/** All of the functions with the name given in the function call;
|
||||
there may be more than one, in which case we need to resolve which
|
||||
overload is the best match. */
|
||||
|
||||
643
func.cpp
Normal file
643
func.cpp
Normal file
@@ -0,0 +1,643 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file func.cpp
|
||||
@brief
|
||||
*/
|
||||
|
||||
#include "func.h"
|
||||
#include "ctx.h"
|
||||
#include "decl.h"
|
||||
#include "expr.h"
|
||||
#include "llvmutil.h"
|
||||
#include "module.h"
|
||||
#include "type.h"
|
||||
#include "stmt.h"
|
||||
#include "sym.h"
|
||||
#include "util.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#include <llvm/Type.h>
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Intrinsics.h>
|
||||
#include <llvm/PassManager.h>
|
||||
#include <llvm/PassRegistry.h>
|
||||
#include <llvm/Transforms/IPO.h>
|
||||
#include <llvm/Support/FormattedStream.h>
|
||||
#include <llvm/Support/FileUtilities.h>
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/Target/TargetOptions.h>
|
||||
#include <llvm/Target/TargetData.h>
|
||||
#include <llvm/PassManager.h>
|
||||
#include <llvm/Analysis/Verifier.h>
|
||||
#include <llvm/Support/CFG.h>
|
||||
#include <llvm/Support/ToolOutputFile.h>
|
||||
#include <llvm/Assembly/PrintModulePass.h>
|
||||
|
||||
/** Builds the Function object for a function definition.

    Type-checks and optimizes the body, looks up the "__mask" symbol and
    the function's own (previously declared) symbol in the module's symbol
    table, collects the symbols of the declared parameters, and, for task
    functions, looks up the threadIndex/threadCount/taskIndex/taskCount
    symbols.

    @param ds    Declaration specifiers of the function; its storage class
                 determines whether the function is treated as exported.
    @param decl  Declarator supplying the function's name, type, source
                 position and parameter declarations.
    @param c     Statements making up the function body; may be NULL.
 */
Function::Function(DeclSpecs *ds, Declarator *decl, Stmt *c) {
    code = c;

    maskSymbol = m->symbolTable->LookupVariable("__mask");
    assert(maskSymbol != NULL);

    if (code) {
        code = code->TypeCheck();
        if (code)
            code = code->Optimize();
    }

    if (g->debugPrint) {
        printf("Add Function\n");
        ds->Print();
        printf("\n");
        decl->Print();
        printf("\n");
        // The body is NULL if none was supplied or if type checking /
        // optimization above returned NULL; don't dereference it then.
        if (code != NULL)
            code->Print(0);
        printf("\n\n\n");
    }

    // Get the symbol for the function from the symbol table.  (It should
    // already have been added to the symbol table by AddGlobal() by the
    // time we get here.)
    type = dynamic_cast<const FunctionType *>(decl->GetType(ds));
    assert(type != NULL);
    sym = m->symbolTable->LookupFunction(decl->sym->name.c_str(), type);
    assert(sym != NULL);
    sym->pos = decl->pos;

    isExported = (ds->storageClass == SC_EXPORT);

    // Record the symbol of each declared parameter, in declaration order.
    if (decl->functionArgs != NULL) {
        for (unsigned int i = 0; i < decl->functionArgs->size(); ++i) {
            Declaration *pdecl = (*decl->functionArgs)[i];
            assert(pdecl->declarators.size() == 1);
            args.push_back(pdecl->declarators[0]->sym);
        }
    }

    // Only task functions use the thread/task index and count symbols.
    if (type->isTask) {
        threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
        assert(threadIndexSym);
        threadCountSym = m->symbolTable->LookupVariable("threadCount");
        assert(threadCountSym);
        taskIndexSym = m->symbolTable->LookupVariable("taskIndex");
        assert(taskIndexSym);
        taskCountSym = m->symbolTable->LookupVariable("taskCount");
        assert(taskCountSym);
    }
    else
        threadIndexSym = threadCountSym = taskIndexSym = taskCountSym = NULL;
}
|
||||
|
||||
|
||||
/** Given an arbitrary type, see if it or any of the types contained in it
|
||||
are varying. Returns true if so, false otherwise.
|
||||
*/
|
||||
static bool
|
||||
lRecursiveCheckVarying(const Type *t) {
|
||||
t = t->GetBaseType();
|
||||
if (t->IsVaryingType()) return true;
|
||||
|
||||
const StructType *st = dynamic_cast<const StructType *>(t);
|
||||
if (st) {
|
||||
for (int i = 0; i < st->GetElementCount(); ++i)
|
||||
if (lRecursiveCheckVarying(st->GetElementType(i)))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/** Given a Symbol representing a function parameter, see if it or any
|
||||
contained types are varying. If so, issue an error. (This function
|
||||
should only be called for parameters to 'export'ed functions, where
|
||||
varying parameters is illegal.
|
||||
*/
|
||||
static void
|
||||
lCheckForVaryingParameter(Symbol *sym) {
|
||||
if (lRecursiveCheckVarying(sym->type)) {
|
||||
const Type *t = sym->type->GetBaseType();
|
||||
if (dynamic_cast<const StructType *>(t))
|
||||
Error(sym->pos, "Struct parameter \"%s\" with varying member(s) is illegal "
|
||||
"in an exported function.",
|
||||
sym->name.c_str());
|
||||
else
|
||||
Error(sym->pos, "Varying parameter \"%s\" is illegal in an exported function.",
|
||||
sym->name.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Given a function type, loop through the function parameters and see if
|
||||
any are StructTypes. If so, issue an error (this seems to be broken
|
||||
currently).
|
||||
|
||||
@todo Fix passing structs from C/C++ to ispc functions.
|
||||
*/
|
||||
static void
|
||||
lCheckForStructParameters(const FunctionType *ftype, SourcePos pos) {
|
||||
const std::vector<const Type *> &argTypes = ftype->GetArgumentTypes();
|
||||
for (unsigned int i = 0; i < argTypes.size(); ++i) {
|
||||
const Type *type = argTypes[i];
|
||||
if (dynamic_cast<const StructType *>(type) != NULL) {
|
||||
Error(pos, "Passing structs to/from application functions is currently broken. "
|
||||
"Use a reference or const reference instead for now.");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** We've got a declaration for a function to process. This function does
|
||||
all the work of creating the corresponding llvm::Function instance,
|
||||
adding the symbol for the function to the symbol table and doing
|
||||
various sanity checks. This function returns true upon success and
|
||||
false if any errors were encountered.
|
||||
*/
|
||||
Symbol *
|
||||
Function::InitFunctionSymbol(DeclSpecs *ds, Declarator *decl) {
|
||||
// Make sure that we've got what we expect here
|
||||
Symbol *funSym = decl->sym;
|
||||
assert(decl->isFunction);
|
||||
assert(decl->arraySize.size() == 0);
|
||||
|
||||
// So far, so good. Go ahead and set the type of the function symbol
|
||||
funSym->type = decl->GetType(ds);
|
||||
|
||||
// If a global variable with the same name has already been declared
|
||||
// issue an error.
|
||||
if (m->symbolTable->LookupVariable(funSym->name.c_str()) != NULL) {
|
||||
Error(decl->pos, "Function \"%s\" shadows previously-declared global variable. "
|
||||
"Ignoring this definition.",
|
||||
funSym->name.c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (ds->storageClass == SC_EXTERN_C) {
|
||||
// Make sure the user hasn't supplied both an 'extern "C"' and a
|
||||
// 'task' qualifier with the function
|
||||
if (ds->typeQualifier & TYPEQUAL_TASK) {
|
||||
Error(funSym->pos, "\"task\" qualifier is illegal with C-linkage extern "
|
||||
"function \"%s\". Ignoring this function.", funSym->name.c_str());
|
||||
return NULL;
|
||||
}
|
||||
std::vector<Symbol *> *funcs;
|
||||
funcs = m->symbolTable->LookupFunction(decl->sym->name.c_str());
|
||||
if (funcs != NULL) {
|
||||
if (funcs->size() > 1) {
|
||||
// Multiple functions with this name have already been declared;
|
||||
// can't overload here
|
||||
Error(funSym->pos, "Can't overload extern \"C\" function \"%s\"; "
|
||||
"%d functions with the same name have already been declared.",
|
||||
funSym->name.c_str(), (int)funcs->size());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// One function with the same name has been declared; see if it
|
||||
// has the same type as this one, in which case it's ok.
|
||||
if (Type::Equal((*funcs)[0]->type, funSym->type))
|
||||
return (*funcs)[0];
|
||||
else {
|
||||
Error(funSym->pos, "Can't overload extern \"C\" function \"%s\".",
|
||||
funSym->name.c_str());
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We should have gotten a FunctionType back from the GetType() call above.
|
||||
const FunctionType *functionType =
|
||||
dynamic_cast<const FunctionType *>(funSym->type);
|
||||
assert(functionType != NULL);
|
||||
|
||||
// Get the LLVM FunctionType
|
||||
bool includeMask = (ds->storageClass != SC_EXTERN_C);
|
||||
LLVM_TYPE_CONST llvm::FunctionType *llvmFunctionType =
|
||||
functionType->LLVMFunctionType(g->ctx, includeMask);
|
||||
if (llvmFunctionType == NULL)
|
||||
return NULL;
|
||||
|
||||
// And create the llvm::Function
|
||||
llvm::GlobalValue::LinkageTypes linkage = ds->storageClass == SC_STATIC ?
|
||||
llvm::GlobalValue::InternalLinkage : llvm::GlobalValue::ExternalLinkage;
|
||||
std::string functionName = ((ds->storageClass == SC_EXTERN_C) ?
|
||||
funSym->name : funSym->MangledName());
|
||||
if (g->mangleFunctionsWithTarget)
|
||||
functionName += g->target.GetISAString();
|
||||
llvm::Function *function =
|
||||
llvm::Function::Create(llvmFunctionType, linkage, functionName.c_str(), m->module);
|
||||
|
||||
// Set function attributes: we never throw exceptions, and want to
|
||||
// inline everything we can
|
||||
function->setDoesNotThrow(true);
|
||||
if (!(ds->storageClass == SC_EXTERN_C) && !g->generateDebuggingSymbols &&
|
||||
(ds->typeQualifier & TYPEQUAL_INLINE))
|
||||
function->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
if (functionType->isTask)
|
||||
// This also applies transitively to members I think?
|
||||
function->setDoesNotAlias(1, true);
|
||||
|
||||
// Make sure that the return type isn't 'varying' if the function is
|
||||
// 'export'ed.
|
||||
if (ds->storageClass == SC_EXPORT &&
|
||||
lRecursiveCheckVarying(functionType->GetReturnType()))
|
||||
Error(decl->pos, "Illegal to return a \"varying\" type from exported function \"%s\"",
|
||||
funSym->name.c_str());
|
||||
|
||||
if (functionType->isTask && (functionType->GetReturnType() != AtomicType::Void))
|
||||
Error(funSym->pos, "Task-qualified functions must have void return type.");
|
||||
|
||||
if (functionType->isExported || functionType->isExternC)
|
||||
lCheckForStructParameters(functionType, funSym->pos);
|
||||
|
||||
// Loop over all of the arguments; process default values if present
|
||||
// and do other checks and parameter attribute setting.
|
||||
bool seenDefaultArg = false;
|
||||
std::vector<ConstExpr *> argDefaults;
|
||||
int nArgs = decl->functionArgs ? decl->functionArgs->size() : 0;
|
||||
for (int i = 0; i < nArgs; ++i) {
|
||||
Declaration *pdecl = (*decl->functionArgs)[i];
|
||||
assert(pdecl->declarators.size() == 1);
|
||||
Symbol *sym = pdecl->declarators[0]->sym;
|
||||
|
||||
// If the function is exported, make sure that the parameter
|
||||
// doesn't have any varying stuff going on in it.
|
||||
if (ds->storageClass == SC_EXPORT)
|
||||
lCheckForVaryingParameter(sym);
|
||||
|
||||
// ISPC assumes that all memory passed in is aligned to the native
|
||||
// width and that no pointers alias. (It should be possible to
|
||||
// specify when this is not the case, but this should be the
|
||||
// default.) Set parameter attributes accordingly.
|
||||
if (!functionType->isTask && dynamic_cast<const ReferenceType *>(sym->type) != NULL) {
|
||||
// NOTE: LLVM indexes function parameters starting from 1.
|
||||
// This is unintuitive.
|
||||
function->setDoesNotAlias(i+1, true);
|
||||
int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
|
||||
function->addAttribute(i+1, llvm::Attribute::constructAlignmentFromInt(align));
|
||||
}
|
||||
|
||||
if (m->symbolTable->LookupFunction(sym->name.c_str()) != NULL)
|
||||
Warning(sym->pos, "Function parameter \"%s\" shadows a function "
|
||||
"declared in global scope.", sym->name.c_str());
|
||||
|
||||
// See if a default argument value was provided with the parameter
|
||||
Expr *defaultValue = pdecl->declarators[0]->initExpr;
|
||||
if (defaultValue != NULL) {
|
||||
// If we have one, make sure it's a compile-time constant
|
||||
seenDefaultArg = true;
|
||||
defaultValue = defaultValue->TypeCheck();
|
||||
defaultValue = defaultValue->Optimize();
|
||||
defaultValue = dynamic_cast<ConstExpr *>(defaultValue);
|
||||
if (!defaultValue) {
|
||||
Error(sym->pos, "Default value for parameter \"%s\" must be "
|
||||
"a compile-time constant.", sym->name.c_str());
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
else if (seenDefaultArg) {
|
||||
// Once one parameter has provided a default value, then all of
|
||||
// the following ones must have them as well.
|
||||
Error(sym->pos, "Parameter \"%s\" is missing default: all parameters after "
|
||||
"the first parameter with a default value must have default values "
|
||||
"as well.", sym->name.c_str());
|
||||
}
|
||||
|
||||
// Add the default value to argDefaults. Note that we make this
|
||||
// call for all parameters, even those where no default value was
|
||||
// provided. In that case, a NULL value is stored here. This
|
||||
// approach means that we can always just look at the i'th entry of
|
||||
// argDefaults to find the default value for the i'th parameter.
|
||||
argDefaults.push_back(dynamic_cast<ConstExpr *>(defaultValue));
|
||||
}
|
||||
|
||||
// And only now can we set the default values in the FunctionType
|
||||
functionType->SetArgumentDefaults(argDefaults);
|
||||
|
||||
// If llvm gave us back a Function * with a different name than the one
|
||||
// we asked for, then there's already a function with that same
|
||||
// (mangled) name in the llvm::Module. In that case, erase the one we
|
||||
// tried to add and just work with the one it already had.
|
||||
if (function->getName() != functionName) {
|
||||
function->eraseFromParent();
|
||||
function = m->module->getFunction(functionName);
|
||||
}
|
||||
funSym->function = function;
|
||||
|
||||
// But if that function has a definition, we don't want to redefine it.
|
||||
if (!function->empty()) {
|
||||
Warning(funSym->pos, "Ignoring redefinition of function \"%s\".",
|
||||
funSym->name.c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Finally, we know all is good and we can add the function to the
|
||||
// symbol table
|
||||
bool ok = m->symbolTable->AddFunction(funSym);
|
||||
assert(ok);
|
||||
return funSym;
|
||||
}
|
||||
|
||||
|
||||
/** Returns the return type recorded in this function's FunctionType. */
const Type *
Function::GetReturnType() const {
    const Type *returnType = type->GetReturnType();
    return returnType;
}
|
||||
|
||||
|
||||
/** Returns the FunctionType that was determined for this function when it
    was constructed. */
const FunctionType *
Function::GetType() const {
    const FunctionType *functionType = type;
    return functionType;
}
|
||||
|
||||
|
||||
/** Parameters for tasks are stored in a big structure; this utility
|
||||
function emits code to copy those values out of the task structure into
|
||||
local stack-allocated variables. (Which we expect that LLVM's
|
||||
'mem2reg' pass will in turn promote to SSA registers..
|
||||
*/
|
||||
static void
|
||||
lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol *> &args,
|
||||
FunctionEmitContext *ctx) {
|
||||
// We expect the argument structure to come in as a poitner to a
|
||||
// structure. Confirm and figure out its type here.
|
||||
const llvm::Type *structArgType = structArgPtr->getType();
|
||||
assert(llvm::isa<llvm::PointerType>(structArgType));
|
||||
const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(structArgType);
|
||||
assert(llvm::isa<llvm::StructType>(pt->getElementType()));
|
||||
const llvm::StructType *argStructType =
|
||||
llvm::dyn_cast<const llvm::StructType>(pt->getElementType());
|
||||
|
||||
// Get the type of the argument we're copying in and its Symbol pointer
|
||||
LLVM_TYPE_CONST llvm::Type *argType = argStructType->getElementType(i);
|
||||
Symbol *sym = args[i];
|
||||
|
||||
// allocate space to copy the parameter in to
|
||||
sym->storagePtr = ctx->AllocaInst(argType, sym->name.c_str());
|
||||
|
||||
// get a pointer to the value in the struct
|
||||
llvm::Value *ptr = ctx->GetElementPtrInst(structArgPtr, 0, i, sym->name.c_str());
|
||||
|
||||
// and copy the value from the struct and into the local alloca'ed
|
||||
// memory
|
||||
llvm::Value *ptrval = ctx->LoadInst(ptr, NULL, sym->name.c_str());
|
||||
ctx->StoreInst(ptrval, sym->storagePtr);
|
||||
ctx->EmitFunctionParameterDebugInfo(sym);
|
||||
}
|
||||
|
||||
|
||||
/** Given the statements implementing a function, emit the code that
|
||||
implements the function. Most of the work do be done here just
|
||||
involves wiring up the function parameter values to be available in the
|
||||
function body code.
|
||||
*/
|
||||
void
|
||||
Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
SourcePos firstStmtPos) {
|
||||
llvm::Value *maskPtr = ctx->AllocaInst(LLVMTypes::MaskType, "mask_memory");
|
||||
ctx->StoreInst(LLVMMaskAllOn, maskPtr);
|
||||
maskSymbol->storagePtr = maskPtr;
|
||||
ctx->SetMaskPointer(maskPtr);
|
||||
|
||||
// add debugging info for __mask, programIndex, ...
|
||||
maskSymbol->pos = firstStmtPos;
|
||||
ctx->EmitVariableDebugInfo(maskSymbol);
|
||||
|
||||
#if 0
|
||||
llvm::BasicBlock *entryBBlock = ctx->GetCurrentBasicBlock();
|
||||
#endif
|
||||
if (type->isTask == true) {
|
||||
// For tasks, we there should always be three parmeters: the
|
||||
// pointer to the structure that holds all of the arguments, the
|
||||
// thread index, and the thread count variables.
|
||||
llvm::Function::arg_iterator argIter = function->arg_begin();
|
||||
llvm::Value *structParamPtr = argIter++;
|
||||
llvm::Value *threadIndex = argIter++;
|
||||
llvm::Value *threadCount = argIter++;
|
||||
llvm::Value *taskIndex = argIter++;
|
||||
llvm::Value *taskCount = argIter++;
|
||||
|
||||
// Copy the function parameter values from the structure into local
|
||||
// storage
|
||||
for (unsigned int i = 0; i < args.size(); ++i)
|
||||
lCopyInTaskParameter(i, structParamPtr, args, ctx);
|
||||
|
||||
// Copy in the mask as well.
|
||||
int nArgs = (int)args.size();
|
||||
// The mask is the last parameter in the argument structure
|
||||
llvm::Value *ptr = ctx->GetElementPtrInst(structParamPtr, 0, nArgs,
|
||||
"task_struct_mask");
|
||||
llvm::Value *ptrval = ctx->LoadInst(ptr, NULL, "mask");
|
||||
ctx->SetEntryMask(ptrval);
|
||||
|
||||
// Copy threadIndex and threadCount into stack-allocated storage so
|
||||
// that their symbols point to something reasonable.
|
||||
threadIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadIndex");
|
||||
ctx->StoreInst(threadIndex, threadIndexSym->storagePtr);
|
||||
|
||||
threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount");
|
||||
ctx->StoreInst(threadCount, threadCountSym->storagePtr);
|
||||
|
||||
// Copy taskIndex and taskCount into stack-allocated storage so
|
||||
// that their symbols point to something reasonable.
|
||||
taskIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex");
|
||||
ctx->StoreInst(taskIndex, taskIndexSym->storagePtr);
|
||||
|
||||
taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount");
|
||||
ctx->StoreInst(taskCount, taskCountSym->storagePtr);
|
||||
}
|
||||
else {
|
||||
// Regular, non-task function
|
||||
llvm::Function::arg_iterator argIter = function->arg_begin();
|
||||
for (unsigned int i = 0; i < args.size(); ++i, ++argIter) {
|
||||
Symbol *sym = args[i];
|
||||
argIter->setName(sym->name.c_str());
|
||||
|
||||
// Allocate stack storage for the parameter and emit code
|
||||
// to store the its value there.
|
||||
sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str());
|
||||
ctx->StoreInst(argIter, sym->storagePtr);
|
||||
ctx->EmitFunctionParameterDebugInfo(sym);
|
||||
}
|
||||
|
||||
// If the number of actual function arguments is equal to the
|
||||
// number of declared arguments in decl->functionArgs, then we
|
||||
// don't have a mask parameter, so set it to be all on. This
|
||||
// happens for exmaple with 'export'ed functions that the app
|
||||
// calls.
|
||||
if (argIter == function->arg_end())
|
||||
ctx->SetEntryMask(LLVMMaskAllOn);
|
||||
else {
|
||||
// Otherwise use the mask to set the entry mask value
|
||||
argIter->setName("__mask");
|
||||
assert(argIter->getType() == LLVMTypes::MaskType);
|
||||
ctx->SetEntryMask(argIter);
|
||||
assert(++argIter == function->arg_end());
|
||||
}
|
||||
}
|
||||
|
||||
// Finally, we can generate code for the function
|
||||
if (code != NULL) {
|
||||
int costEstimate = code->EstimateCost();
|
||||
bool checkMask = (type->isTask == true) ||
|
||||
((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
|
||||
costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
|
||||
Debug(code->pos, "Estimated cost for function \"%s\" = %d\n",
|
||||
sym->name.c_str(), costEstimate);
|
||||
// If the body of the function is non-trivial, then we wrap the
|
||||
// entire thing around a varying "cif (true)" test in order to reap
|
||||
// the side-effect benefit of checking to see if the execution mask
|
||||
// is all on and thence having a specialized code path for that
|
||||
// case. If this is a simple function, then this isn't worth the
|
||||
// code bloat / overhead.
|
||||
if (checkMask) {
|
||||
bool allTrue[ISPC_MAX_NVEC];
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
allTrue[i] = true;
|
||||
Expr *trueExpr = new ConstExpr(AtomicType::VaryingBool, allTrue,
|
||||
code->pos);
|
||||
code = new IfStmt(trueExpr, code, NULL, true, code->pos);
|
||||
}
|
||||
|
||||
ctx->SetDebugPos(code->pos);
|
||||
ctx->AddInstrumentationPoint("function entry");
|
||||
code->EmitCode(ctx);
|
||||
}
|
||||
|
||||
if (ctx->GetCurrentBasicBlock()) {
|
||||
// FIXME: We'd like to issue a warning if we've reached the end of
|
||||
// the function without a return statement (for non-void
|
||||
// functions). But the test below isn't right, since we can have
|
||||
// (with 'x' a varying test) "if (x) return a; else return b;", in
|
||||
// which case we have a valid basic block but its unreachable so ok
|
||||
// to not have return statement.
|
||||
#if 0
|
||||
// If the bblock has no predecessors, then it doesn't matter if it
|
||||
// doesn't have a return; it'll never be reached. If it does,
|
||||
// issue a warning. Also need to warn if it's the entry block for
|
||||
// the function (in which case it will not have predeccesors but is
|
||||
// still reachable.)
|
||||
if (type->GetReturnType() != AtomicType::Void &&
|
||||
(pred_begin(ec.bblock) != pred_end(ec.bblock) || (ec.bblock == entryBBlock)))
|
||||
Warning(sym->pos, "Missing return statement in function returning \"%s\".",
|
||||
type->rType->GetString().c_str());
|
||||
#endif
|
||||
|
||||
// FIXME: would like to set the context's current position to
|
||||
// e.g. the end of the function code
|
||||
|
||||
// if bblock is non-NULL, it hasn't been terminated by e.g. a
|
||||
// return instruction. Need to add a return instruction.
|
||||
ctx->ReturnInst();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Function::GenerateIR() {
|
||||
llvm::Function *function = sym->function;
|
||||
assert(function != NULL);
|
||||
|
||||
// Figure out a reasonable source file position for the start of the
|
||||
// function body. If possible, get the position of the first actual
|
||||
// non-StmtList statment...
|
||||
SourcePos firstStmtPos = sym->pos;
|
||||
if (code) {
|
||||
StmtList *sl = dynamic_cast<StmtList *>(code);
|
||||
if (sl && sl->GetStatements().size() > 0 &&
|
||||
sl->GetStatements()[0] != NULL)
|
||||
firstStmtPos = sl->GetStatements()[0]->pos;
|
||||
else
|
||||
firstStmtPos = code->pos;
|
||||
}
|
||||
|
||||
// And we can now go ahead and emit the code
|
||||
{
|
||||
FunctionEmitContext ec(this, sym, function, firstStmtPos);
|
||||
emitCode(&ec, function, firstStmtPos);
|
||||
}
|
||||
|
||||
if (m->errorCount == 0) {
|
||||
if (llvm::verifyFunction(*function, llvm::ReturnStatusAction) == true) {
|
||||
if (g->debugPrint) {
|
||||
llvm::PassManager ppm;
|
||||
ppm.add(llvm::createPrintModulePass(&llvm::outs()));
|
||||
ppm.run(*m->module);
|
||||
}
|
||||
FATAL("Function verificication failed");
|
||||
}
|
||||
|
||||
// If the function is 'export'-qualified, emit a second version of
|
||||
// it without a mask parameter and without name mangling so that
|
||||
// the application can call it
|
||||
if (isExported) {
|
||||
if (!type->isTask) {
|
||||
LLVM_TYPE_CONST llvm::FunctionType *ftype =
|
||||
type->LLVMFunctionType(g->ctx);
|
||||
llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
|
||||
std::string functionName = sym->name;
|
||||
if (g->mangleFunctionsWithTarget)
|
||||
functionName += std::string("_") + g->target.GetISAString();
|
||||
llvm::Function *appFunction =
|
||||
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
|
||||
appFunction->setDoesNotThrow(true);
|
||||
|
||||
if (appFunction->getName() != functionName) {
|
||||
// this was a redefinition for which we already emitted an
|
||||
// error, so don't worry about this one...
|
||||
appFunction->eraseFromParent();
|
||||
}
|
||||
else {
|
||||
// And emit the code again
|
||||
FunctionEmitContext ec(this, sym, appFunction, firstStmtPos);
|
||||
emitCode(&ec, appFunction, firstStmtPos);
|
||||
if (m->errorCount == 0) {
|
||||
sym->exportedFunction = appFunction;
|
||||
if (llvm::verifyFunction(*appFunction,
|
||||
llvm::ReturnStatusAction) == true) {
|
||||
if (g->debugPrint) {
|
||||
llvm::PassManager ppm;
|
||||
ppm.add(llvm::createPrintModulePass(&llvm::outs()));
|
||||
ppm.run(*m->module);
|
||||
}
|
||||
FATAL("Function verificication failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
70
func.h
Normal file
70
func.h
Normal file
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** @file func.h
|
||||
@brief Representation of a function in a source file.
|
||||
*/
|
||||
|
||||
#ifndef ISPC_FUNC_H
|
||||
#define ISPC_FUNC_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include <vector>
|
||||
|
||||
/** @brief Representation of a single function being compiled.

    Holds the function's symbol, its FunctionType, the symbols for its
    parameters and the statement that forms its body, and emits LLVM IR
    for it through GenerateIR().
 */
class Function {
|
||||
public:
|
||||
Function(DeclSpecs *ds, Declarator *decl, Stmt *code);
|
||||
|
||||
static Symbol *InitFunctionSymbol(DeclSpecs *ds, Declarator *decl);
|
||||
|
||||
/** Returns the return type reported by the function's FunctionType. */
const Type *GetReturnType() const;
|
||||
/** Returns the function's FunctionType (the 'type' member). */
const FunctionType *GetType() const;
|
||||
|
||||
/** Generate LLVM IR for the function into the current module. */
|
||||
void GenerateIR();
|
||||
|
||||
private:
|
||||
/** Emits the mask and parameter setup followed by the function body
    into the given LLVM function.  GenerateIR() calls this for
    sym->function and, for exported non-task functions, a second time
    for the exported variant. */
void emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
SourcePos firstStmtPos);
|
||||
|
||||
/** Symbol for the function itself; GenerateIR() emits code into
    sym->function. */
Symbol *sym;
|
||||
const FunctionType *type;
|
||||
/** Symbols for the function's parameters; emitCode() gives each one
    stack storage holding the incoming argument value. */
std::vector<Symbol *> args;
|
||||
/** Statement implementing the function body; emitCode() skips body
    emission when this is NULL. */
Stmt *code;
|
||||
/** If true (and the function is not a task), GenerateIR() also emits
    an externally-linked version of the function. */
bool isExported;
|
||||
/** Symbol whose storage emitCode() points at the stack-allocated mask. */
Symbol *maskSymbol;
|
||||
/** Symbols that emitCode() gives storage to only when the function is
    a task. */
Symbol *threadIndexSym, *threadCountSym;
|
||||
Symbol *taskIndexSym, *taskCountSym;
|
||||
};
|
||||
|
||||
#endif // ISPC_FUNC_H
|
||||
36
ispc.cpp
36
ispc.cpp
@@ -85,7 +85,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
if (isa == NULL) {
|
||||
if (!strcasecmp(cpu, "atom"))
|
||||
isa = "sse2";
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
else if (!strcasecmp(cpu, "sandybridge") ||
|
||||
!strcasecmp(cpu, "corei7-avx"))
|
||||
isa = "avx";
|
||||
@@ -135,7 +135,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->vectorWidth = 4;
|
||||
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
||||
}
|
||||
else if (!strcasecmp(isa, "sse4x2")) {
|
||||
else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) {
|
||||
t->isa = Target::SSE4;
|
||||
t->nativeVectorWidth = 4;
|
||||
t->vectorWidth = 8;
|
||||
@@ -174,7 +174,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
const char *
|
||||
Target::SupportedTargetCPUs() {
|
||||
return "atom, barcelona, core2, corei7, "
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
"corei7-avx, "
|
||||
#endif
|
||||
"istanbul, nocona, penryn, "
|
||||
@@ -193,8 +193,8 @@ Target::SupportedTargetArchs() {
|
||||
|
||||
const char *
|
||||
Target::SupportedTargetISAs() {
|
||||
return "sse2, sse4, sse4x2"
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||
return "sse2, sse4, sse4-x2"
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
", avx, avx-x2"
|
||||
#endif
|
||||
;
|
||||
@@ -241,7 +241,9 @@ Target::GetTargetMachine() const {
|
||||
std::string featuresString = cpu + std::string(",") + attributes;
|
||||
llvm::TargetMachine *targetMachine =
|
||||
target->createTargetMachine(triple, featuresString);
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
targetMachine->setRelocationModel(relocModel);
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
#endif
|
||||
assert(targetMachine != NULL);
|
||||
|
||||
@@ -250,6 +252,23 @@ Target::GetTargetMachine() const {
|
||||
}
|
||||
|
||||
|
||||
const char *
|
||||
Target::GetISAString() const {
|
||||
switch (isa) {
|
||||
case Target::SSE2:
|
||||
return "sse2";
|
||||
case Target::SSE4:
|
||||
return "sse4";
|
||||
case Target::AVX:
|
||||
return "avx";
|
||||
break;
|
||||
default:
|
||||
FATAL("Unhandled target in GetISAString()");
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Opt
|
||||
|
||||
@@ -281,6 +300,7 @@ Globals::Globals() {
|
||||
emitPerfWarnings = true;
|
||||
emitInstrumentation = false;
|
||||
generateDebuggingSymbols = false;
|
||||
mangleFunctionsWithTarget = false;
|
||||
|
||||
ctx = new llvm::LLVMContext;
|
||||
|
||||
@@ -291,12 +311,6 @@ Globals::Globals() {
|
||||
#endif
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// ASTNode
|
||||
|
||||
ASTNode::~ASTNode() {
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// SourcePos
|
||||
|
||||
|
||||
48
ispc.h
48
ispc.h
@@ -90,8 +90,8 @@ class Declarator;
|
||||
class FunctionEmitContext;
|
||||
class Expr;
|
||||
class ExprList;
|
||||
class Function;
|
||||
class FunctionType;
|
||||
class GatherBuffer;
|
||||
class Module;
|
||||
class Stmt;
|
||||
class Symbol;
|
||||
@@ -124,37 +124,6 @@ struct SourcePos {
|
||||
};
|
||||
|
||||
|
||||
/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
|
||||
|
||||
This class defines a basic interface that all abstract syntax tree
|
||||
(AST) nodes must implement. The base classes for both expressions
|
||||
(Expr) and statements (Stmt) inherit from this class.
|
||||
*/
|
||||
class ASTNode {
|
||||
public:
|
||||
ASTNode(SourcePos p) : pos(p) { }
|
||||
virtual ~ASTNode();
|
||||
|
||||
/** The Optimize() method should perform any appropriate early-stage
|
||||
optimizations on the node (e.g. constant folding). The caller
|
||||
should use the returned ASTNode * in place of the original node.
|
||||
This method may return NULL if an error is encountered during
|
||||
optimization. */
|
||||
virtual ASTNode *Optimize() = 0;
|
||||
|
||||
/** Type checking should be performed by the node when this method is
|
||||
called. In the event of an error, a NULL value may be returned.
|
||||
As with ASTNode::Optimize(), the caller should store the returned
|
||||
pointer in place of the original ASTNode *. */
|
||||
virtual ASTNode *TypeCheck() = 0;
|
||||
|
||||
virtual int EstimateCost() const = 0;
|
||||
|
||||
/** All AST nodes must track the file position where they are
|
||||
defined. */
|
||||
const SourcePos pos;
|
||||
};
|
||||
|
||||
/** @brief Structure that defines a compilation target
|
||||
|
||||
This structure defines a compilation target for the ispc compiler.
|
||||
@@ -185,13 +154,20 @@ struct Target {
|
||||
/** Returns the LLVM TargetMachine object corresponding to this
|
||||
target. */
|
||||
llvm::TargetMachine *GetTargetMachine() const;
|
||||
|
||||
/** Returns a string like "avx" encoding the target. */
|
||||
const char *GetISAString() const;
|
||||
|
||||
/** llvm Target object representing this target. */
|
||||
const llvm::Target *target;
|
||||
|
||||
/** Enumerator giving the instruction sets that the compiler can
|
||||
target. */
|
||||
enum ISA { SSE2, SSE4, AVX };
|
||||
target. These should be ordered from "worse" to "better" in that
|
||||
if a processor supports multiple target ISAs, then the most
|
||||
flexible/performant of them will appear last in the enumerant.  Note
|
||||
also that __best_available_isa() needs to be updated if ISAs are
|
||||
added or the enumerant values are reordered. */
|
||||
enum ISA { SSE2, SSE4, AVX, NUM_ISAS };
|
||||
|
||||
/** Instruction set being compiled to. */
|
||||
ISA isa;
|
||||
@@ -354,6 +330,10 @@ struct Globals {
|
||||
/** Indicates whether ispc should generate debugging symbols for the
|
||||
program in its output. */
|
||||
bool generateDebuggingSymbols;
|
||||
|
||||
/** If true, function names are mangled by appending the target ISA and
|
||||
vector width to them. */
|
||||
bool mangleFunctionsWithTarget;
|
||||
|
||||
/** Global LLVMContext object */
|
||||
llvm::LLVMContext *ctx;
|
||||
|
||||
68
ispc.vcxproj
68
ispc.vcxproj
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -11,32 +11,43 @@
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="ast.cpp" />
|
||||
<ClCompile Include="builtins.cpp" />
|
||||
<ClCompile Include="ctx.cpp" />
|
||||
<ClCompile Include="decl.cpp" />
|
||||
<ClCompile Include="expr.cpp" />
|
||||
<ClCompile Include="func.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx-x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-c-32.cpp" />
|
||||
<ClCompile Include="gen-bitcode-c-64.cpp" />
|
||||
<ClCompile Include="gen-bitcode-dispatch.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse4.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse4x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse4-x2.cpp" />
|
||||
<ClCompile Include="gen-stdlib.cpp" />
|
||||
<ClCompile Include="ispc.cpp" />
|
||||
<ClCompile Include="lex.cc" />
|
||||
<ClCompile Include="lex.cc">
|
||||
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4003;4018</DisableSpecificWarnings>
|
||||
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">4146;4800;4996;4355;4624;4005;4003;4018</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
<ClCompile Include="llvmutil.cpp" />
|
||||
<ClCompile Include="module.cpp" />
|
||||
<ClCompile Include="main.cpp" />
|
||||
<ClCompile Include="opt.cpp" />
|
||||
<ClCompile Include="parse.cc" />
|
||||
<ClCompile Include="parse.cc">
|
||||
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
|
||||
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
<CustomBuild Include="builtins-c.c">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp;
|
||||
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp</Command>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang builtins-c.c</Message>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp;
|
||||
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp</Command>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang builtins-c.c</Message>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c.cpp</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c.cpp</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp</Outputs>
|
||||
</CustomBuild>
|
||||
<ClCompile Include="stmt.cpp" />
|
||||
<ClCompile Include="sym.cpp" />
|
||||
@@ -44,10 +55,12 @@
|
||||
<ClCompile Include="util.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="ast.h" />
|
||||
<ClInclude Include="builtins.h" />
|
||||
<ClInclude Include="ctx.h" />
|
||||
<ClInclude Include="decl.h" />
|
||||
<ClInclude Include="expr.h" />
|
||||
<ClInclude Include="func.h" />
|
||||
<ClInclude Include="ispc.h" />
|
||||
<ClInclude Include="llvmutil.h" />
|
||||
<ClInclude Include="module.h" />
|
||||
@@ -61,9 +74,9 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stdlib.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib.cpp</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib.cpp</Outputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib.cpp</Message>
|
||||
@@ -83,16 +96,29 @@
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins-sse4x2.ll">
|
||||
<CustomBuild Include="builtins-dispatch.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-dispatch.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-dispatch.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-dispatch.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-dispatch.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins-sse4-x2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll > gen-bitcode-sse4x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
@@ -194,7 +220,7 @@
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
@@ -202,7 +228,7 @@
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
@@ -212,7 +238,7 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
@@ -222,7 +248,7 @@
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
@@ -75,7 +75,6 @@ extern "C" {
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/ExecutionEngine/ExecutionEngine.h>
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
#include <llvm/ExecutionEngine/MCJIT.h>
|
||||
#include <llvm/Support/TargetRegistry.h>
|
||||
#include <llvm/Support/TargetSelect.h>
|
||||
#else
|
||||
@@ -99,26 +98,29 @@ extern "C" {
|
||||
bool shouldFail = false;
|
||||
|
||||
extern "C" {
|
||||
void ISPCLaunch(void *, void *);
|
||||
void ISPCSync();
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
||||
void ISPCFree(void *ptr);
|
||||
void ISPCLaunch(void **, void *, void *, int32_t);
|
||||
void ISPCSync(void *);
|
||||
void *ISPCAlloc(void **, int64_t size, int32_t alignment);
|
||||
}
|
||||
|
||||
void ISPCLaunch(void *func, void *data) {
|
||||
typedef void (*TaskFuncType)(void *, int, int);
|
||||
void ISPCLaunch(void **handle, void *func, void *data, int32_t count) {
|
||||
*handle = (void *)0xdeadbeef;
|
||||
typedef void (*TaskFuncType)(void *, int, int, int, int);
|
||||
TaskFuncType tft = (TaskFuncType)(func);
|
||||
tft(data, 0, 1);
|
||||
for (int i = 0; i < count; ++i)
|
||||
tft(data, 0, 1, i, count);
|
||||
}
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
void ISPCSync(void *) {
|
||||
}
|
||||
|
||||
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment) {
|
||||
void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
|
||||
*handle = (void *)0xdeadbeef;
|
||||
// leak time!
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return _aligned_malloc(size, alignment);
|
||||
return _aligned_malloc((size_t)size, alignment);
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
return memalign(alignment, size);
|
||||
@@ -134,18 +136,6 @@ void *ISPCMalloc(int64_t size, int32_t alignment) {
|
||||
}
|
||||
|
||||
|
||||
void ISPCFree(void *ptr) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
_aligned_free(ptr);
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
free(ptr);
|
||||
#endif
|
||||
#ifdef ISPC_IS_APPLE
|
||||
free(((void**)ptr)[-1]);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void usage(int ret) {
|
||||
fprintf(stderr, "usage: ispc_test\n");
|
||||
fprintf(stderr, "\t[-h/--help]\tprint help\n");
|
||||
@@ -218,8 +208,7 @@ static bool lRunTest(const char *fn) {
|
||||
ee->addGlobalMapping(func, (void *)FUNC)
|
||||
DO_FUNC(ISPCLaunch, "ISPCLaunch");
|
||||
DO_FUNC(ISPCSync, "ISPCSync");
|
||||
DO_FUNC(ISPCMalloc, "ISPCMalloc");
|
||||
DO_FUNC(ISPCFree, "ISPCFree");
|
||||
DO_FUNC(ISPCAlloc, "ISPCAlloc");
|
||||
DO_FUNC(putchar, "putchar");
|
||||
DO_FUNC(printf, "printf");
|
||||
DO_FUNC(fflush, "fflush");
|
||||
|
||||
@@ -52,14 +52,15 @@
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>ISPC_IS_WINDOWS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>LLVM_3_0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
@@ -70,8 +71,9 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>ISPC_IS_WINDOWS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>LLVM_3_0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
@@ -79,10 +81,10 @@
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
22
main.cpp
22
main.cpp
@@ -96,7 +96,9 @@ static void usage(int ret) {
|
||||
printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
|
||||
printf(" disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
|
||||
#endif
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
printf(" [--pic]\t\t\t\tGenerate position-independent code\n");
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
printf(" [--target=<isa>]\t\t\tSelect target ISA. <isa>={%s}\n", Target::SupportedTargetISAs());
|
||||
printf(" [--version]\t\t\t\tPrint ispc version\n");
|
||||
printf(" [--woff]\t\t\t\tDisable warnings\n");
|
||||
@@ -302,8 +304,10 @@ int main(int Argc, char *Argv[]) {
|
||||
g->includeStdlib = false;
|
||||
else if (!strcmp(argv[i], "--nocpp"))
|
||||
g->runCPP = false;
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
else if (!strcmp(argv[i], "--pic"))
|
||||
generatePIC = true;
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
|
||||
printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n",
|
||||
BUILD_DATE, BUILD_VERSION);
|
||||
@@ -325,20 +329,6 @@ int main(int Argc, char *Argv[]) {
|
||||
if (debugSet && !optSet)
|
||||
g->opt.level = 0;
|
||||
|
||||
if (!Target::GetTarget(arch, cpu, target, generatePIC, &g->target))
|
||||
usage(1);
|
||||
|
||||
m = new Module(file);
|
||||
if (m->CompileFile() == 0) {
|
||||
if (outFileName != NULL)
|
||||
if (!m->WriteOutput(ot, outFileName))
|
||||
return 1;
|
||||
if (headerFileName != NULL)
|
||||
if (!m->WriteOutput(Module::Header, headerFileName))
|
||||
return 1;
|
||||
}
|
||||
int errorCount = m->errorCount;
|
||||
delete m;
|
||||
|
||||
return errorCount > 0;
|
||||
return Module::CompileAndOutput(file, arch, cpu, target, generatePIC,
|
||||
ot, outFileName, headerFileName);
|
||||
}
|
||||
|
||||
1164
module.cpp
1164
module.cpp
File diff suppressed because it is too large
Load Diff
62
module.h
62
module.h
@@ -40,10 +40,11 @@
|
||||
#define ISPC_MODULE_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include "ast.h"
|
||||
|
||||
namespace llvm
|
||||
{
|
||||
class raw_string_ostream;
|
||||
class raw_string_ostream;
|
||||
}
|
||||
|
||||
class Module {
|
||||
@@ -75,11 +76,37 @@ public:
|
||||
variables, and the types used by them. */
|
||||
};
|
||||
|
||||
/** Write the corresponding output type to the given file. Returns
|
||||
true on success, false if there has been an error. The given
|
||||
filename may be NULL, indicating that output should go to standard
|
||||
output. */
|
||||
bool WriteOutput(OutputType ot, const char *filename);
|
||||
/** Compile the given source file, generating assembly, object file, or
|
||||
LLVM bitcode output, as well as (optionally) a header file with
|
||||
declarations of functions and types used in the ispc/application
|
||||
interface.
|
||||
@param srcFile Pathname to ispc source file to compile
|
||||
@param arch Target architecture (e.g. "x86-64")
|
||||
@param cpu Target CPU (e.g. "core-i7")
|
||||
@param targets Target ISAs; this parameter may give a single target
|
||||
ISA, or may give a comma-separated list of them in
|
||||
case we are compiling to multiple ISAs.
|
||||
@param generatePIC Indicates whether position-independent code should
|
||||
be generated.
|
||||
@param outputType Type of output to generate (object files, assembly,
|
||||
LLVM bitcode.)
|
||||
@param outFileName Base name of output filename for object files, etc.
|
||||
If for example the multiple targets "sse2" and "avx"
|
||||
are specified in the "targets" parameter and if this
|
||||
parameter is "foo.o", then we'll generate multiple
|
||||
output files, like "foo.o", "foo_sse2.o", "foo_avx.o".
|
||||
@param headerFileName If non-NULL, emit a header file suitable for
|
||||
inclusion from C/C++ code with declarations of
|
||||
types and functions exported from the given ispc
|
||||
source file.
|
||||
@return Number of errors encountered when compiling
|
||||
srcFile.
|
||||
*/
|
||||
static int CompileAndOutput(const char *srcFile, const char *arch,
|
||||
const char *cpu, const char *targets,
|
||||
bool generatePIC, OutputType outputType,
|
||||
const char *outFileName,
|
||||
const char *headerFileName);
|
||||
|
||||
/** Total number of errors encountered during compilation. */
|
||||
int errorCount;
|
||||
@@ -94,24 +121,23 @@ public:
|
||||
/** The diBuilder manages generating debugging information */
|
||||
llvm::DIBuilder *diBuilder;
|
||||
|
||||
GatherBuffer *gatherBuffer;
|
||||
|
||||
private:
|
||||
const char *filename;
|
||||
AST *ast;
|
||||
|
||||
/** This member records the global variables that have been defined
|
||||
with 'extern' linkage, so that it's easy to include their
|
||||
declarations in generated header files.
|
||||
|
||||
@todo FIXME: it would be nice to eliminate this and then query the
|
||||
symbol table or the llvm Module for them when/if we need them.
|
||||
*/
|
||||
std::vector<Symbol *> externGlobals;
|
||||
|
||||
/** Write the corresponding output type to the given file. Returns
|
||||
true on success, false if there has been an error. The given
|
||||
filename may be NULL, indicating that output should go to standard
|
||||
output. */
|
||||
bool writeOutput(OutputType ot, const char *filename);
|
||||
bool writeHeader(const char *filename);
|
||||
bool writeObjectFileOrAssembly(OutputType outputType, const char *filename);
|
||||
void execPreprocessor(const char *infilename, llvm::raw_string_ostream* ostream) const;
|
||||
static bool writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine,
|
||||
llvm::Module *module, OutputType outputType,
|
||||
const char *outFileName);
|
||||
static bool writeBitcode(llvm::Module *module, const char *outFileName);
|
||||
|
||||
void execPreprocessor(const char *infilename, llvm::raw_string_ostream* ostream) const;
|
||||
};
|
||||
|
||||
#endif // ISPC_MODULE_H
|
||||
|
||||
30
opt.cpp
30
opt.cpp
@@ -204,6 +204,7 @@ Optimize(llvm::Module *module, int optLevel) {
|
||||
optPM.add(CreateIsCompileTimeConstantPass(true));
|
||||
optPM.add(llvm::createFunctionInliningPass());
|
||||
optPM.add(CreateMakeInternalFuncsStaticPass());
|
||||
optPM.add(llvm::createCFGSimplificationPass());
|
||||
optPM.add(llvm::createGlobalDCEPass());
|
||||
}
|
||||
else {
|
||||
@@ -678,7 +679,8 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
|
||||
bool
|
||||
IntrinsicsOpt::matchesMaskInstruction(llvm::Function *function) {
|
||||
for (unsigned int i = 0; i < maskInstructions.size(); ++i)
|
||||
if (function == maskInstructions[i].function)
|
||||
if (maskInstructions[i].function != NULL &&
|
||||
function == maskInstructions[i].function)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
@@ -687,7 +689,8 @@ IntrinsicsOpt::matchesMaskInstruction(llvm::Function *function) {
|
||||
IntrinsicsOpt::BlendInstruction *
|
||||
IntrinsicsOpt::matchingBlendInstruction(llvm::Function *function) {
|
||||
for (unsigned int i = 0; i < blendInstructions.size(); ++i)
|
||||
if (function == blendInstructions[i].function)
|
||||
if (blendInstructions[i].function != NULL &&
|
||||
function == blendInstructions[i].function)
|
||||
return &blendInstructions[i];
|
||||
return NULL;
|
||||
}
|
||||
@@ -1147,7 +1150,8 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
|
||||
continue;
|
||||
GSInfo *info = NULL;
|
||||
for (int i = 0; i < numGSFuncs; ++i)
|
||||
if (callInst->getCalledFunction() == gsFuncs[i].func) {
|
||||
if (gsFuncs[i].func != NULL &&
|
||||
callInst->getCalledFunction() == gsFuncs[i].func) {
|
||||
info = &gsFuncs[i];
|
||||
break;
|
||||
}
|
||||
@@ -1288,7 +1292,7 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
||||
int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
|
||||
MSInfo *info = NULL;
|
||||
for (int i = 0; i < nMSFuncs; ++i) {
|
||||
if (called == msInfo[i].func) {
|
||||
if (msInfo[i].func != NULL && called == msInfo[i].func) {
|
||||
info = &msInfo[i];
|
||||
break;
|
||||
}
|
||||
@@ -1428,7 +1432,8 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
||||
continue;
|
||||
LMSInfo *info = NULL;
|
||||
for (unsigned int i = 0; i < sizeof(msInfo) / sizeof(msInfo[0]); ++i) {
|
||||
if (callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
|
||||
if (msInfo[i].pseudoFunc != NULL &&
|
||||
callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
|
||||
info = &msInfo[i];
|
||||
break;
|
||||
}
|
||||
@@ -2151,13 +2156,15 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
||||
GatherImpInfo *gatherInfo = NULL;
|
||||
ScatterImpInfo *scatterInfo = NULL;
|
||||
for (unsigned int i = 0; i < sizeof(gInfo) / sizeof(gInfo[0]); ++i) {
|
||||
if (calledFunc == gInfo[i].pseudoFunc) {
|
||||
if (gInfo[i].pseudoFunc != NULL &&
|
||||
calledFunc == gInfo[i].pseudoFunc) {
|
||||
gatherInfo = &gInfo[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (unsigned int i = 0; i < sizeof(sInfo) / sizeof(sInfo[0]); ++i) {
|
||||
if (calledFunc == sInfo[i].pseudoFunc) {
|
||||
if (sInfo[i].pseudoFunc != NULL &&
|
||||
calledFunc == sInfo[i].pseudoFunc) {
|
||||
scatterInfo = &sInfo[i];
|
||||
break;
|
||||
}
|
||||
@@ -2402,7 +2409,8 @@ LowerGSPass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
||||
llvm::Function *calledFunc = callInst->getCalledFunction();
|
||||
LowerGSInfo *info = NULL;
|
||||
for (unsigned int i = 0; i < sizeof(lgsInfo) / sizeof(lgsInfo[0]); ++i) {
|
||||
if (calledFunc == lgsInfo[i].pseudoFunc) {
|
||||
if (lgsInfo[i].pseudoFunc != NULL &&
|
||||
calledFunc == lgsInfo[i].pseudoFunc) {
|
||||
info = &lgsInfo[i];
|
||||
break;
|
||||
}
|
||||
@@ -2488,7 +2496,7 @@ IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
||||
int j;
|
||||
int nFuncs = sizeof(funcs) / sizeof(funcs[0]);
|
||||
for (j = 0; j < nFuncs; ++j) {
|
||||
if (callInst->getCalledFunction() == funcs[j])
|
||||
if (funcs[j] != NULL && callInst->getCalledFunction() == funcs[j])
|
||||
break;
|
||||
}
|
||||
if (j == nFuncs)
|
||||
@@ -2568,7 +2576,7 @@ llvm::RegisterPass<MakeInternalFuncsStaticPass>
|
||||
bool
|
||||
MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
|
||||
const char *names[] = {
|
||||
"__do_print", "__fast_masked_vload",
|
||||
"__do_print", "__fast_masked_vload", "__num_cores",
|
||||
"__gather_base_offsets_i8", "__gather_base_offsets_i16",
|
||||
"__gather_base_offsets_i32", "__gather_base_offsets_i64",
|
||||
"__gather_elt_8", "__gather_elt_16",
|
||||
@@ -2593,7 +2601,7 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
|
||||
for (int i = 0; i < count; ++i) {
|
||||
llvm::Function *f = m->module->getFunction(names[i]);
|
||||
if (f != NULL) {
|
||||
f->setLinkage(llvm::GlobalValue::PrivateLinkage);
|
||||
f->setLinkage(llvm::GlobalValue::InternalLinkage);
|
||||
modifiedAny = true;
|
||||
}
|
||||
}
|
||||
|
||||
75
parse.yy
75
parse.yy
@@ -165,7 +165,7 @@ static const char *lParamListTokens[] = {
|
||||
%token TOKEN_CBREAK TOKEN_CCONTINUE TOKEN_CRETURN TOKEN_SYNC TOKEN_PRINT
|
||||
|
||||
%type <expr> primary_expression postfix_expression
|
||||
%type <expr> unary_expression cast_expression
|
||||
%type <expr> unary_expression cast_expression launch_expression
|
||||
%type <expr> multiplicative_expression additive_expression shift_expression
|
||||
%type <expr> relational_expression equality_expression and_expression
|
||||
%type <expr> exclusive_or_expression inclusive_or_expression
|
||||
@@ -177,6 +177,7 @@ static const char *lParamListTokens[] = {
|
||||
%type <stmt> statement labeled_statement compound_statement for_init_statement
|
||||
%type <stmt> expression_statement selection_statement iteration_statement
|
||||
%type <stmt> jump_statement statement_list declaration_statement print_statement
|
||||
%type <stmt> sync_statement
|
||||
|
||||
%type <declaration> declaration parameter_declaration
|
||||
%type <declarators> init_declarator_list
|
||||
@@ -221,7 +222,7 @@ primary_expression
|
||||
else {
|
||||
std::vector<Symbol *> *funs = m->symbolTable->LookupFunction(name);
|
||||
if (funs)
|
||||
$$ = new FunctionSymbolExpr(funs, @1);
|
||||
$$ = new FunctionSymbolExpr(name, funs, @1);
|
||||
}
|
||||
if ($$ == NULL) {
|
||||
std::vector<std::string> alternates =
|
||||
@@ -256,18 +257,32 @@ primary_expression
|
||||
| '(' expression ')' { $$ = $2; }
|
||||
;
|
||||
|
||||
launch_expression
|
||||
: TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>'
|
||||
{
|
||||
ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @3);
|
||||
$$ = new FunctionCallExpr($3, $5, @3, true, oneExpr);
|
||||
}
|
||||
| TOKEN_LAUNCH '<' postfix_expression '(' ')' '>'
|
||||
{
|
||||
ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @3);
|
||||
$$ = new FunctionCallExpr($3, new ExprList(@3), @3, true, oneExpr);
|
||||
}
|
||||
| TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' argument_expression_list ')' '>'
|
||||
{ $$ = new FunctionCallExpr($6, $8, @6, true, $3); }
|
||||
| TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' ')' '>'
|
||||
{ $$ = new FunctionCallExpr($6, new ExprList(@6), @6, true, $3); }
|
||||
;
|
||||
|
||||
postfix_expression
|
||||
: primary_expression
|
||||
| postfix_expression '[' expression ']'
|
||||
{ $$ = new IndexExpr($1, $3, @1); }
|
||||
| postfix_expression '(' ')'
|
||||
{ $$ = new FunctionCallExpr($1, new ExprList(@1), @1, false); }
|
||||
{ $$ = new FunctionCallExpr($1, new ExprList(@1), @1); }
|
||||
| postfix_expression '(' argument_expression_list ')'
|
||||
{ $$ = new FunctionCallExpr($1, $3, @1, false); }
|
||||
| TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>'
|
||||
{ $$ = new FunctionCallExpr($3, $5, @3, true); }
|
||||
| TOKEN_LAUNCH '<' postfix_expression '(' ')' '>'
|
||||
{ $$ = new FunctionCallExpr($3, new ExprList(@3), @3, true); }
|
||||
{ $$ = new FunctionCallExpr($1, $3, @1); }
|
||||
| launch_expression
|
||||
| postfix_expression '.' TOKEN_IDENTIFIER
|
||||
{ $$ = MemberExpr::create($1, yytext, @1, @3); }
|
||||
/* | postfix_expression TOKEN_PTR_OP TOKEN_IDENTIFIER
|
||||
@@ -436,8 +451,6 @@ assignment_expression
|
||||
|
||||
expression
|
||||
: assignment_expression
|
||||
| TOKEN_SYNC
|
||||
{ $$ = new SyncExpr(@1); }
|
||||
| expression ',' assignment_expression
|
||||
{ $$ = new BinaryExpr(BinaryExpr::Comma, $1, $3, @2); }
|
||||
;
|
||||
@@ -928,9 +941,13 @@ parameter_list
|
||||
builtinTokens.push_back(*token);
|
||||
++token;
|
||||
}
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||
if (strlen(yytext) == 0)
|
||||
Error(@1, "Syntax error--premature end of file.");
|
||||
else {
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unexpected.%s", yytext, alts.c_str());
|
||||
}
|
||||
$$ = NULL;
|
||||
}
|
||||
;
|
||||
@@ -1019,6 +1036,7 @@ statement
|
||||
| jump_statement
|
||||
| declaration_statement
|
||||
| print_statement
|
||||
| sync_statement
|
||||
| error
|
||||
{
|
||||
std::vector<std::string> builtinTokens;
|
||||
@@ -1027,9 +1045,13 @@ statement
|
||||
builtinTokens.push_back(*token);
|
||||
++token;
|
||||
}
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||
if (strlen(yytext) == 0)
|
||||
Error(@1, "Syntax error--premature end of file.");
|
||||
else {
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unexpected.%s", yytext, alts.c_str());
|
||||
}
|
||||
$$ = NULL;
|
||||
}
|
||||
;
|
||||
@@ -1155,6 +1177,11 @@ jump_statement
|
||||
{ $$ = new ReturnStmt($2, true, @1); }
|
||||
;
|
||||
|
||||
sync_statement
|
||||
: TOKEN_SYNC
|
||||
{ $$ = new ExprStmt(new SyncExpr(@1), @1); }
|
||||
;
|
||||
|
||||
print_statement
|
||||
: TOKEN_PRINT '(' string_constant ')'
|
||||
{
|
||||
@@ -1177,9 +1204,13 @@ translation_unit
|
||||
builtinTokens.push_back(*token);
|
||||
++token;
|
||||
}
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||
if (strlen(yytext) == 0)
|
||||
Error(@1, "Syntax error--premature end of file.");
|
||||
else {
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unexpected.%s", yytext, alts.c_str());
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
@@ -1266,6 +1297,12 @@ static void lAddThreadIndexCountToSymbolTable(SourcePos pos) {
|
||||
|
||||
Symbol *threadCountSym = new Symbol("threadCount", pos, AtomicType::UniformConstUInt32);
|
||||
m->symbolTable->AddVariable(threadCountSym);
|
||||
|
||||
Symbol *taskIndexSym = new Symbol("taskIndex", pos, AtomicType::UniformConstUInt32);
|
||||
m->symbolTable->AddVariable(taskIndexSym);
|
||||
|
||||
Symbol *taskCountSym = new Symbol("taskCount", pos, AtomicType::UniformConstUInt32);
|
||||
m->symbolTable->AddVariable(taskCountSym);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ import random
|
||||
import string
|
||||
import mutex
|
||||
import subprocess
|
||||
import platform
|
||||
|
||||
parser = OptionParser()
|
||||
parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests",
|
||||
@@ -25,7 +26,7 @@ parser.add_option("-s", "--static-exe", dest="static_exe",
|
||||
help="Create and run a regular executable for each test (rather than using the LLVM JIT).",
|
||||
default=False, action="store_true")
|
||||
parser.add_option('-t', '--target', dest='target',
|
||||
help='Set compilation target (sse2, sse4, sse4x2, avx, avx-x2)',
|
||||
help='Set compilation target (sse2, sse4, sse4-x2, avx, avx-x2)',
|
||||
default="sse4")
|
||||
parser.add_option('-a', '--arch', dest='arch',
|
||||
help='Set architecture (x86, x86-64)',
|
||||
@@ -137,8 +138,10 @@ def run_tasks_from_queue(queue):
|
||||
gcc_arch = '-m32'
|
||||
else:
|
||||
gcc_arch = '-m64'
|
||||
gcc_cmd = "g++ -Wl,-no_pie %s test_static.cpp -DTEST_SIG=%d %s.o -o %s" % \
|
||||
gcc_cmd = "g++ %s test_static.cpp -DTEST_SIG=%d %s.o -o %s" % \
|
||||
(gcc_arch, match, filename, exe_name)
|
||||
if platform.system() == 'Darwin':
|
||||
gcc_cmd += ' -Wl,-no_pie'
|
||||
if should_fail:
|
||||
gcc_cmd += " -DEXPECT_FAILURE"
|
||||
|
||||
|
||||
164
simple.vcxproj
Executable file
164
simple.vcxproj
Executable file
@@ -0,0 +1,164 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="morph.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="morph.vo">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
cl /E /TP %(Filename).vo | volta -O2 - -o %(Filename).obj -h %(Filename).h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
cl /E /TP %(Filename).vo | volta -O2 -o %(Filename).obj -h %(Filename).h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
cl /E /TP %(Filename).vo | volta -O2 - -o %(Filename).obj -h %(Filename).h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
cl /E /TP %(Filename).vo | volta -O2 - -o %(Filename).obj -h %(Filename).h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{947C5311-8B78-4D05-BEE4-BCF342D4B367}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>morph</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
130
stdlib.ispc
130
stdlib.ispc
@@ -369,7 +369,7 @@ static inline uniform float reduce_min(float v) {
|
||||
static inline uniform float reduce_max(float v) {
|
||||
// For the lanes where the mask is off, replace the given value with
|
||||
// negative infinity, so that it doesn't affect the result.
|
||||
const uniform int iflt_neg_max = 0xff800000; // -infinity
|
||||
const int iflt_neg_max = 0xff800000; // -infinity
|
||||
// Must use __floatbits_varying_int32, not floatbits(), since with the
|
||||
// latter the current mask enters into the returned result...
|
||||
return __reduce_max_float(__mask ? v : __floatbits_varying_int32(iflt_neg_max));
|
||||
@@ -427,7 +427,7 @@ static inline uniform double reduce_min(double v) {
|
||||
}
|
||||
|
||||
static inline uniform double reduce_max(double v) {
|
||||
const uniform int64 iflt_neg_max = 0xfff0000000000000; // -infinity
|
||||
const int64 iflt_neg_max = 0xfff0000000000000; // -infinity
|
||||
// Must use __doublebits_varying_int64, not doublebits(), since with the
|
||||
// latter the current mask enters into the returned result...
|
||||
return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max));
|
||||
@@ -471,21 +471,21 @@ static inline uniform unsigned int64 reduce_max(unsigned int64 v) {
|
||||
return __reduce_max_uint64(__mask ? v : 0);
|
||||
}
|
||||
|
||||
#define REDUCE_EQUAL(TYPE, FUNCTYPE) \
|
||||
#define REDUCE_EQUAL(TYPE, FUNCTYPE, MASKTYPE) \
|
||||
static inline uniform bool reduce_equal(TYPE v) { \
|
||||
uniform TYPE unusedValue; \
|
||||
return __reduce_equal_##FUNCTYPE(v, unusedValue, (int32)__mask); \
|
||||
return __reduce_equal_##FUNCTYPE(v, unusedValue, (MASKTYPE)__mask); \
|
||||
} \
|
||||
static inline uniform bool reduce_equal(TYPE v, reference uniform TYPE value) { \
|
||||
return __reduce_equal_##FUNCTYPE(v, value, (int32)__mask); \
|
||||
return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask); \
|
||||
}
|
||||
|
||||
REDUCE_EQUAL(int32, int32)
|
||||
REDUCE_EQUAL(unsigned int32, int32)
|
||||
REDUCE_EQUAL(float, float)
|
||||
REDUCE_EQUAL(int64, int64)
|
||||
REDUCE_EQUAL(unsigned int64, int64)
|
||||
REDUCE_EQUAL(double, double)
|
||||
REDUCE_EQUAL(int32, int32, int32)
|
||||
REDUCE_EQUAL(unsigned int32, int32, unsigned int32)
|
||||
REDUCE_EQUAL(float, float, int32)
|
||||
REDUCE_EQUAL(int64, int64, int32)
|
||||
REDUCE_EQUAL(unsigned int64, int64, unsigned int32)
|
||||
REDUCE_EQUAL(double, double, int32)
|
||||
|
||||
static int32 exclusive_scan_add(int32 v) {
|
||||
return __exclusive_scan_add_i32(v, (int32)__mask);
|
||||
@@ -549,23 +549,32 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) {
|
||||
static inline uniform int
|
||||
packed_load_active(uniform unsigned int a[], uniform int start,
|
||||
reference unsigned int vals) {
|
||||
return __packed_load_active(a, start, vals, __mask);
|
||||
return __packed_load_active(a, (unsigned int)start, vals,
|
||||
(unsigned int32)__mask);
|
||||
}
|
||||
|
||||
static inline uniform int
|
||||
packed_store_active(uniform unsigned int a[], uniform int start,
|
||||
unsigned int vals) {
|
||||
return __packed_store_active(a, start, vals, __mask);
|
||||
return __packed_store_active(a, (unsigned int)start, vals,
|
||||
(unsigned int32)__mask);
|
||||
}
|
||||
|
||||
static inline uniform int packed_load_active(uniform int a[], uniform int start,
|
||||
reference int vals) {
|
||||
return __packed_load_active(a, start, vals, __mask);
|
||||
return __packed_load_active(a, start, vals, (int32)__mask);
|
||||
}
|
||||
|
||||
static inline uniform int packed_store_active(uniform int a[], uniform int start,
|
||||
int vals) {
|
||||
return __packed_store_active(a, start, vals, __mask);
|
||||
return __packed_store_active(a, start, vals, (int32)__mask);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// System information
|
||||
|
||||
static inline int num_cores() {
|
||||
return __num_cores();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -581,24 +590,38 @@ static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
|
||||
TA ret = __atomic_##OPB##_##TB##_global(ref, value, (MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \
|
||||
uniform TA value) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, (MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
|
||||
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
|
||||
static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
|
||||
uniform TA oneval = reduce_##OPA(value); \
|
||||
TA ret; \
|
||||
if (lanemask() != 0) { \
|
||||
memory_barrier(); \
|
||||
ret = __atomic_##OPB##_##TB##_global(ref, oneval, __mask); \
|
||||
ret = __atomic_##OPB##_uniform_##TB##_global(ref, oneval, (MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
} \
|
||||
return ret; \
|
||||
} \
|
||||
static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \
|
||||
uniform TA value) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, (MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
DEFINE_ATOMIC_OP(int32,int32,add,add,int32)
|
||||
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,int32)
|
||||
DEFINE_ATOMIC_OP(int32,int32,and,and,int32)
|
||||
DEFINE_ATOMIC_OP(int32,int32,or,or,int32)
|
||||
DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32)
|
||||
@@ -606,56 +629,63 @@ DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32)
|
||||
|
||||
// For everything but atomic min and max, we can use the same
|
||||
// implementations for unsigned as for signed.
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,unsigned int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,unsigned int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,unsigned int32)
|
||||
|
||||
DEFINE_ATOMIC_OP(float,float,swap,swap,int32)
|
||||
|
||||
DEFINE_ATOMIC_OP(int64,int64,add,add,int64)
|
||||
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int64)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
|
||||
DEFINE_ATOMIC_OP(int64,int64,and,and,int64)
|
||||
DEFINE_ATOMIC_OP(int64,int64,or,or,int64)
|
||||
DEFINE_ATOMIC_OP(int64,int64,xor,xor,int64)
|
||||
DEFINE_ATOMIC_OP(int64,int64,add,add,int32)
|
||||
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,int32)
|
||||
DEFINE_ATOMIC_OP(int64,int64,and,and,int32)
|
||||
DEFINE_ATOMIC_OP(int64,int64,or,or,int32)
|
||||
DEFINE_ATOMIC_OP(int64,int64,xor,xor,int32)
|
||||
DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32)
|
||||
|
||||
// For everything but atomic min and max, we can use the same
|
||||
// implementations for unsigned as for signed.
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,int64)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,int64)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,int64)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,int64)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,int64)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,unsigned int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,unsigned int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,unsigned int32)
|
||||
|
||||
DEFINE_ATOMIC_OP(double,double,swap,swap,int32)
|
||||
|
||||
#undef DEFINE_ATOMIC_OP
|
||||
|
||||
#define ATOMIC_DECL_CMPXCHG(TA, TB) \
|
||||
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
|
||||
static inline TA atomic_compare_exchange_global( \
|
||||
uniform reference TA ref, TA oldval, TA newval) { \
|
||||
memory_barrier(); \
|
||||
TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, __mask); \
|
||||
TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, (MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
static inline uniform TA atomic_compare_exchange_global( \
|
||||
uniform reference TA ref, uniform TA oldval, uniform TA newval) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_compare_exchange_uniform_##TB##_global(ref, oldval, newval, (MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
ATOMIC_DECL_CMPXCHG(int32, int32)
|
||||
ATOMIC_DECL_CMPXCHG(unsigned int32, int32)
|
||||
ATOMIC_DECL_CMPXCHG(float, float)
|
||||
ATOMIC_DECL_CMPXCHG(int64, int64)
|
||||
ATOMIC_DECL_CMPXCHG(unsigned int64, int64)
|
||||
ATOMIC_DECL_CMPXCHG(double, double)
|
||||
ATOMIC_DECL_CMPXCHG(int32, int32, int32)
|
||||
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, unsigned int32)
|
||||
ATOMIC_DECL_CMPXCHG(float, float, int32)
|
||||
ATOMIC_DECL_CMPXCHG(int64, int64, int32)
|
||||
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, unsigned int32)
|
||||
ATOMIC_DECL_CMPXCHG(double, double, int32)
|
||||
|
||||
#undef ATOMIC_DECL_CMPXCHG
|
||||
|
||||
|
||||
1
stmt.h
1
stmt.h
@@ -39,6 +39,7 @@
|
||||
#define ISPC_STMT_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include "ast.h"
|
||||
|
||||
/** @brief Interface class for statements in the ispc language.
|
||||
|
||||
|
||||
7
sym.cpp
7
sym.cpp
@@ -43,13 +43,14 @@
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Symbol
|
||||
|
||||
Symbol::Symbol(const std::string &n, SourcePos p, const Type *t)
|
||||
Symbol::Symbol(const std::string &n, SourcePos p, const Type *t,
|
||||
StorageClass sc)
|
||||
: pos(p), name(n) {
|
||||
storagePtr = NULL;
|
||||
function = NULL;
|
||||
function = exportedFunction = NULL;
|
||||
type = t;
|
||||
constValue = NULL;
|
||||
isStatic = false;
|
||||
storageClass = sc;
|
||||
varyingCFDepth = 0;
|
||||
}
|
||||
|
||||
|
||||
37
sym.h
37
sym.h
@@ -41,6 +41,7 @@
|
||||
#define ISPC_SYM_H
|
||||
|
||||
#include "ispc.h"
|
||||
#include "decl.h"
|
||||
#include <map>
|
||||
|
||||
class StructType;
|
||||
@@ -63,7 +64,8 @@ class Symbol {
|
||||
public:
|
||||
/** The Symbol constructor takes the name of the symbol, its
|
||||
position in a source file, and its type (if known). */
|
||||
Symbol(const std::string &name, SourcePos pos, const Type *t = NULL);
|
||||
Symbol(const std::string &name, SourcePos pos, const Type *t = NULL,
|
||||
StorageClass sc = SC_NONE);
|
||||
|
||||
/** This method should only be called for function symbols; for them,
|
||||
it returns a mangled version of the function name with the argument
|
||||
@@ -81,6 +83,11 @@ public:
|
||||
llvm::Function *function; /*!< For symbols that represent functions,
|
||||
this stores the LLVM Function value for
|
||||
the symbol once it has been created. */
|
||||
llvm::Function *exportedFunction;
|
||||
/*!< For symbols that represent functions with
|
||||
'export' qualifiers, this points to the LLVM
|
||||
Function for the application-callable version
|
||||
of the function. */
|
||||
const Type *type; /*!< The type of the symbol; if not set by the
|
||||
constructor, this is set after the
|
||||
declaration around the symbol has been parsed. */
|
||||
@@ -93,8 +100,8 @@ public:
|
||||
storagePtr member will be its constant value. (This
|
||||
messiness is due to needing an ispc ConstExpr for the early
|
||||
constant folding optimizations). */
|
||||
bool isStatic; /*!< Records whether this symbol had a static qualifier in
|
||||
its declaration. */
|
||||
StorageClass storageClass;/*!< Records the storage class (if any) provided with the
|
||||
symbol's declaration. */
|
||||
int varyingCFDepth; /*!< This member records the number of levels of nested 'varying'
|
||||
control flow within which the symbol was declared. Having
|
||||
this value available makes it possible to avoid performing
|
||||
@@ -186,6 +193,14 @@ public:
|
||||
void GetMatchingFunctions(Predicate pred,
|
||||
std::vector<Symbol *> *matches) const;
|
||||
|
||||
/** Returns all of the variable symbols in the symbol table that match
|
||||
the given predicate. The predicate is defined as in the
|
||||
GetMatchingFunctions() method.
|
||||
*/
|
||||
template <typename Predicate>
|
||||
void GetMatchingVariables(Predicate pred,
|
||||
std::vector<Symbol *> *matches) const;
|
||||
|
||||
/** Adds the named type to the symbol table. This is used for both
|
||||
struct definitions (where <tt>struct Foo</tt> causes type \c Foo to
|
||||
be added to the symbol table) as well as for <tt>typedef</tt>s.
|
||||
@@ -251,9 +266,9 @@ private:
|
||||
};
|
||||
|
||||
|
||||
template <typename Predicate>
|
||||
void SymbolTable::GetMatchingFunctions(Predicate pred,
|
||||
std::vector<Symbol *> *matches) const {
|
||||
template <typename Predicate> void
|
||||
SymbolTable::GetMatchingFunctions(Predicate pred,
|
||||
std::vector<Symbol *> *matches) const {
|
||||
// Iterate through all function symbols and apply the given predicate.
|
||||
// If it returns true, add the Symbol * to the provided vector.
|
||||
std::map<std::string, std::vector<Symbol *> >::const_iterator iter;
|
||||
@@ -266,4 +281,14 @@ void SymbolTable::GetMatchingFunctions(Predicate pred,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename Predicate> void
|
||||
SymbolTable::GetMatchingVariables(Predicate pred,
|
||||
std::vector<Symbol *> *matches) const {
|
||||
for (unsigned int i = 0; i < variables.size(); ++i)
|
||||
for (unsigned int j = 0; j < variables[i]->size(); ++j)
|
||||
if (pred((*variables[i])[j]))
|
||||
matches->push_back((*variables[i])[j]);
|
||||
}
|
||||
|
||||
#endif // ISPC_SYM_H
|
||||
|
||||
@@ -58,23 +58,26 @@ extern "C" {
|
||||
extern void f_di(float *result, double *a, int *b);
|
||||
extern void result(float *val);
|
||||
|
||||
void ISPCLaunch(void *f, void *d);
|
||||
void ISPCSync();
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
||||
void ISPCFree(void *ptr);
|
||||
void ISPCLaunch(void **handlePtr, void *f, void *d, int);
|
||||
void ISPCSync(void *handle);
|
||||
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
|
||||
}
|
||||
|
||||
void ISPCLaunch(void *f, void *d) {
|
||||
typedef void (*TaskFuncType)(void *, int, int);
|
||||
void ISPCLaunch(void **handle, void *f, void *d, int count) {
|
||||
*handle = (void *)0xdeadbeef;
|
||||
typedef void (*TaskFuncType)(void *, int, int, int, int);
|
||||
TaskFuncType func = (TaskFuncType)f;
|
||||
func(d, 0, 1);
|
||||
for (int i = 0; i < count; ++i)
|
||||
func(d, 0, 1, i, count);
|
||||
}
|
||||
|
||||
void ISPCSync() {
|
||||
void ISPCSync(void *) {
|
||||
}
|
||||
|
||||
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment) {
|
||||
void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
|
||||
*handle = (void *)0xdeadbeef;
|
||||
// and now, we leak...
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return _aligned_malloc(size, alignment);
|
||||
#endif
|
||||
@@ -92,18 +95,6 @@ void *ISPCMalloc(int64_t size, int32_t alignment) {
|
||||
}
|
||||
|
||||
|
||||
void ISPCFree(void *ptr) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
_aligned_free(ptr);
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
free(ptr);
|
||||
#endif
|
||||
#ifdef ISPC_IS_APPLE
|
||||
free(((void**)ptr)[-1]);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int w = width();
|
||||
|
||||
@@ -5,7 +5,8 @@ uniform unsigned int32 s = 0;
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
float b = atomic_add_global(s, 1);
|
||||
float delta = 1;
|
||||
float b = atomic_add_global(s, delta);
|
||||
RET[programIndex] = reduce_add(b);
|
||||
}
|
||||
|
||||
|
||||
@@ -6,8 +6,9 @@ uniform unsigned int32 s = 0;
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
float b = 0;
|
||||
float delta = 1;
|
||||
if (programIndex < 2)
|
||||
b = atomic_add_global(s, 1);
|
||||
b = atomic_add_global(s, delta);
|
||||
RET[programIndex] = s;
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,8 @@ uniform int64 s = 0;
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
float b = atomic_add_global(s, 1);
|
||||
float delta = 1;
|
||||
float b = atomic_add_global(s, delta);
|
||||
RET[programIndex] = reduce_add(b);
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,8 @@ uniform int32 s = 0xff;
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
float b = atomic_xor_global(s, 0xfffffff0);
|
||||
int32 bits = 0xfffffff0;
|
||||
float b = atomic_xor_global(s, bits);
|
||||
RET[programIndex] = s;
|
||||
}
|
||||
|
||||
|
||||
@@ -6,8 +6,9 @@ uniform unsigned int32 s = 0;
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
float b = 0;
|
||||
int32 delta = 1;
|
||||
if (programIndex < 2)
|
||||
b = atomic_add_global(s, 1);
|
||||
b = atomic_add_global(s, delta);
|
||||
RET[programIndex] = reduce_add(b);
|
||||
}
|
||||
|
||||
|
||||
14
tests/atomics-uniform-1.ispc
Normal file
14
tests/atomics-uniform-1.ispc
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
uniform unsigned int32 s = 10;
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
uniform unsigned int32 b = atomic_add_global(s, 1);
|
||||
RET[programIndex] = s;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 11;
|
||||
}
|
||||
14
tests/atomics-uniform-2.ispc
Normal file
14
tests/atomics-uniform-2.ispc
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
uniform unsigned int32 s = 0b1010;
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
uniform unsigned int32 b = atomic_or_global(s, 1);
|
||||
RET[programIndex] = s;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0b1011;
|
||||
}
|
||||
14
tests/atomics-uniform-3.ispc
Normal file
14
tests/atomics-uniform-3.ispc
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
uniform unsigned int32 s = 0b1010;
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
uniform unsigned int32 b = atomic_or_global(s, 1);
|
||||
RET[programIndex] = b;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0b1010;
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user